/*
 *
 * NIST SHA-3 Competition Submission
 *
 * Sgail Hash Function
 *
 * Development Implementation - to produce intermediate values
 *
 * v 0.4 : 20090114
 * 
 * Peter Maxwell : peter@allicient.co.uk
 *
 * 
 * Changelog : 
 *	- Principle key derivation now uses a left, right and combined preliminary key
 *	- Fixed a problem in some clean-up code where keys weren't getting zero'd
 *	- Fixed implementation error in do__quad_diffuse__q2
 *	- Fixed error where key extract x4 and x2 weren't following spec
 * 
 *
 */

/*
 * When compiling, must link in nist_v3__tables.c
 */

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <limits.h>

#include "nist_v3.h"





/*
 * NIST Specified Functions
 */

/*
 * Init - NIST API entry point.
 *
 * Thin wrapper around do__init__hash_state(): it builds an all-zero secret key,
 * selects the centre-round constant matching the requested digest size and
 * forwards everything (plus a zero 64-bit argument and sbox_0).
 *
 *   hashbitlen <= 512            -> CENTRE_ROUNDS__512_BITS
 *   512  < hashbitlen <= 1024    -> CENTRE_ROUNDS__1024_BITS
 *   1024 < hashbitlen <= 2048    -> CENTRE_ROUNDS__2048_BITS
 *
 * Returns whatever do__init__hash_state() returns.
 *
 * NOTE(review): for hashbitlen > 2048 no branch is taken and the value
 * returned is indeterminate - callers must only pass supported lengths.
 */
HashReturn Init( hashState *state, int hashbitlen ) {

	u64 zero_key[ SECRET_KEY__64_BIT_WORDS ];
	HashReturn status;
	int word;

	/* The wrapper always uses an all-zero secret key (four words are cleared) */
	for ( word = 0; word < 4; word++ ) {
		zero_key[ word ] = 0;
	}

	/* The number of centre rounds depends on how many digest bits are requested */
	if ( hashbitlen <= 512 ) {
		status = do__init__hash_state( state, hashbitlen, CENTRE_ROUNDS__512_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, sbox_0 );
	} else if ( hashbitlen <= 1024 ) {
		status = do__init__hash_state( state, hashbitlen, CENTRE_ROUNDS__1024_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, sbox_0 );
	} else if ( hashbitlen <= 2048 ) {
		status = do__init__hash_state( state, hashbitlen, CENTRE_ROUNDS__2048_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, sbox_0 );
	}

	return status;

}


/*
 * Update - NIST API entry point.
 *
 * Thin wrapper: hands the state, the data pointer and its length in bits to
 * do__update__hash_state() together with the default lookup tables
 * (mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0) and returns that
 * function's result unchanged.
 */
HashReturn Update( hashState *state, const BitSequence *data, DataLength databitlen ) {

	return do__update__hash_state( state, data, databitlen, mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0 );

}


/*
 * Final - NIST API entry point.
 *
 * Thin wrapper: hands the state and the output buffer to
 * do__finalise__hash_state() together with the default lookup tables
 * (mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0) and returns that
 * function's result unchanged.
 */
HashReturn Final( hashState *state, BitSequence *hashval ) {

	return do__finalise__hash_state( state, hashval, mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0 );

}


/*
 * Hash - NIST API all-in-one entry point.
 *
 * Thin wrapper around do__quick__hash(): it builds an all-zero secret key,
 * selects the centre-round constant matching the requested digest size and
 * forwards everything (plus a zero 64-bit argument and the default lookup
 * tables mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0).
 *
 *   hashbitlen <= 512            -> CENTRE_ROUNDS__512_BITS
 *   512  < hashbitlen <= 1024    -> CENTRE_ROUNDS__1024_BITS
 *   1024 < hashbitlen <= 2048    -> CENTRE_ROUNDS__2048_BITS
 *
 * Returns whatever do__quick__hash() returns.
 *
 * NOTE(review): for hashbitlen > 2048 no branch is taken and the value
 * returned is indeterminate - callers must only pass supported lengths.
 */
HashReturn Hash( int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval ) {

	u64 zero_key[ SECRET_KEY__64_BIT_WORDS ];
	HashReturn status;
	int word;

	/* The wrapper always uses an all-zero secret key (four words are cleared) */
	for ( word = 0; word < 4; word++ ) {
		zero_key[ word ] = 0;
	}

	/* The number of centre rounds depends on how many digest bits are requested */
	if ( hashbitlen <= 512 ) {
		status = do__quick__hash( hashbitlen, data, databitlen, hashval, CENTRE_ROUNDS__512_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0 );
	} else if ( hashbitlen <= 1024 ) {
		status = do__quick__hash( hashbitlen, data, databitlen, hashval, CENTRE_ROUNDS__1024_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0 );
	} else if ( hashbitlen <= 2048 ) {
		status = do__quick__hash( hashbitlen, data, databitlen, hashval, CENTRE_ROUNDS__2048_BITS, PRINCIPLE_KEY_ROUNDS__1_ROUNDS, zero_key, 0LLU, mds_8x8s_0, mds_16x8s_lhs_0, mds_16x8s_rhs_0, sbox_0 );
	}

	return status;

}





/*
 * Status Output Functions
 */

/*
 * Print the state buffer to stdout as hex bytes, one line per row of
 * SGAIL__STATE__DIMENSION bytes.
 *
 * A blank line precedes every fourth row and an extra space is inserted
 * before every eighth column (except the first) to group the output.
 * Note the byte order shown is the opposite of the 64-bit word display.
 */
void do__display_state_buffer_bytewise( u8 state_array[ SGAIL__STATE__SIZE ] ) {

	u8 row, col;
	const u8 *row_bytes;

	for ( row = 0; row < SGAIL__STATE__DIMENSION; row++ ) {

		/* Start of this row inside the flat, row-major buffer */
		row_bytes = state_array + ( row * SGAIL__STATE__DIMENSION );

		if ( row % 4 == 0 ) {
			printf("\n");
		}
		printf("Row %2d (byte-wise) :  ", row );

		for ( col = 0; col < SGAIL__STATE__DIMENSION; col++ ) {

			if ( col != 0 && col % 8 == 0 ) {
				printf( " " );
			}
			printf( "%02x ", row_bytes[ col ] );

		}

		printf("\n" );

	}

	printf("\n\n");

}


/*
 * Print the input block to stdout as hex bytes, one line per row.
 *
 * The block is treated as SGAIL__INPUT_BLOCK__ROWS rows of
 * SGAIL__INPUT_BLOCK__COLUMNS bytes stored row-major, so the start of a row
 * is row * SGAIL__INPUT_BLOCK__COLUMNS (the row stride is the column count).
 *
 * A blank line precedes every fourth row.  Inside a row one extra space is
 * printed before every eighth column and a further one before every
 * sixteenth column; nothing extra is printed before column 0, matching the
 * state-buffer display.
 * Note the byte order shown is the opposite of the 64-bit word display.
 */
void do__display_input_block_bytewise( u8 input_block[ SGAIL__INPUT_BLOCK__SIZE ] ) {

	u8 loop_counter_row, loop_counter_column;

	for ( loop_counter_row = 0; loop_counter_row < SGAIL__INPUT_BLOCK__ROWS; loop_counter_row++ ) {

		if ( ( loop_counter_row % 4 ) == 0 ) printf("\n");
		printf("Row %2d (byte-wise) :  ", loop_counter_row );

		for ( loop_counter_column = 0; loop_counter_column < SGAIL__INPUT_BLOCK__COLUMNS; loop_counter_column++ ) {

			/* Group separators: every 8 bytes, and a wider gap every 16 bytes */
			if ( loop_counter_column != 0 && ( loop_counter_column % 8 ) == 0 ) printf( " " );
			if ( loop_counter_column != 0 && ( loop_counter_column % 16 ) == 0 ) printf( " " );

			/* Row-major addressing: stride is the number of columns, not rows */
			printf( "%02x ", input_block[ ( loop_counter_row * SGAIL__INPUT_BLOCK__COLUMNS ) + loop_counter_column ] );

		}

		printf("\n" );

	}

	printf("\n\n");

}



/*
 * Print the state buffer to stdout as 64-bit words, two words per line,
 * each as 16 zero-padded hex digits.  A blank line precedes every eighth
 * line.
 *
 * The words are cast to unsigned long long and printed with %016llx so the
 * conversion is valid whatever the underlying definition of u64 is (the
 * length modifier must follow the flags and field width).
 */
void do__display_state_buffer_64bit_words( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	u32 loop_counter_row;

	for ( loop_counter_row = 0; loop_counter_row < ( SGAIL__NUM_64_BIT_WORDS >> 1 ); loop_counter_row++ ) {

		if ( ( loop_counter_row % 8 ) == 0 ) printf("\n");
		printf("Row %2u (64bit-words) :  %016llx  %016llx\n",
			(unsigned int)loop_counter_row,
			(unsigned long long)state_array[ loop_counter_row << 1 ],
			(unsigned long long)state_array[ ( loop_counter_row << 1 ) + 1 ] );

	}

	printf("\n\n");

}


/*
 * Print the input block to stdout as 64-bit words, four words per line,
 * each as 16 zero-padded hex digits.  A blank line precedes every eighth
 * line.
 *
 * The words are cast to unsigned long long and printed with %016llx so the
 * conversion is valid whatever the underlying definition of u64 is (the
 * length modifier must follow the flags and field width).
 */
void do__display_input_block_64bit_words( u64 input_block[ SGAIL__NUM_64_BIT_WORDS__INPUT_BLOCK ] ) {

	u32 loop_counter_row;

	for ( loop_counter_row = 0; loop_counter_row < ( SGAIL__NUM_64_BIT_WORDS__INPUT_BLOCK >> 2 ); loop_counter_row++ ) {

		if ( ( loop_counter_row % 8 ) == 0 ) printf("\n");
		printf("Row %2u (64bit-words) :  %016llx  %016llx   %016llx  %016llx\n",
			(unsigned int)loop_counter_row,
			(unsigned long long)input_block[ loop_counter_row << 2 ],
			(unsigned long long)input_block[ ( loop_counter_row << 2 ) + 1 ],
			(unsigned long long)input_block[ ( loop_counter_row << 2 ) + 2 ],
			(unsigned long long)input_block[ ( loop_counter_row << 2 ) + 3 ] );

	}

	printf("\n\n");

}

/*
 * Print a 224-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_224_bit_hash__byte_wise( u8 digest_result[ DIGEST__224_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n224-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__224_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}


/*
 * Print a 256-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_256_bit_hash__byte_wise( u8 digest_result[ DIGEST__256_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n256-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__256_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 384-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_384_bit_hash__byte_wise( u8 digest_result[ DIGEST__384_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n384-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__384_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 512-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_512_bit_hash__byte_wise( u8 digest_result[ DIGEST__512_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n512-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__512_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 768-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_768_bit_hash__byte_wise( u8 digest_result[ DIGEST__768_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n768-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__768_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 1024-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_1024_bit_hash__byte_wise( u8 digest_result[ DIGEST__1024_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n1024-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__1024_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 1536-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_1536_bit_hash__byte_wise( u8 digest_result[ DIGEST__1536_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n1536-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__1536_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}

/*
 * Print a 2048-bit digest to stdout on one line as space-separated hex bytes,
 * with an extra space between each group of eight bytes.
 */
void do__display_2048_bit_hash__byte_wise( u8 digest_result[ DIGEST__2048_BITS__BYTE_LENGTH ] ) {

	u32 loop_counter;

	/* The heading contains no conversions, so it must not be given arguments */
	printf("\n2048-bit digest (byte-wise) : ");
	for ( loop_counter = 0; loop_counter < DIGEST__2048_BITS__BYTE_LENGTH; loop_counter++ ) {

		if ( ( loop_counter % 8 ) == 0 && loop_counter != 0 ) printf(" ");
		printf("%02x ", digest_result[ loop_counter ] );

	}

	printf("\n");

}


/*
 * Print the first four 64-bit words of the secret key to stdout, each as
 * 16 zero-padded hex digits.
 *
 * The words are cast to unsigned long long and printed with %016llx (the
 * length modifier must follow the flags and field width).
 */
void do__display_secret_key( u64 secret_key[ SECRET_KEY__64_BIT_WORDS ] ) {

	printf("\nSecret key: %016llx %016llx %016llx %016llx\n\n",
		(unsigned long long)secret_key[ 0 ],
		(unsigned long long)secret_key[ 1 ],
		(unsigned long long)secret_key[ 2 ],
		(unsigned long long)secret_key[ 3 ] );

}

/*
 * Print the first eight 64-bit words of the preliminary key to stdout on
 * two lines of four words, each word as 16 zero-padded hex digits.
 *
 * The words are cast to unsigned long long and printed with %016llx (the
 * length modifier must follow the flags and field width).
 */
void do__display_preliminary_key( u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ] ) {

	printf("\nPreliminary key: %016llx %016llx %016llx %016llx\n",
		(unsigned long long)preliminary_key[ 0 ],
		(unsigned long long)preliminary_key[ 1 ],
		(unsigned long long)preliminary_key[ 2 ],
		(unsigned long long)preliminary_key[ 3 ] );
	printf("                 %016llx %016llx %016llx %016llx\n\n",
		(unsigned long long)preliminary_key[ 4 ],
		(unsigned long long)preliminary_key[ 5 ],
		(unsigned long long)preliminary_key[ 6 ],
		(unsigned long long)preliminary_key[ 7 ] );

}


/* To test the formatting of output functions is done correctly to console */
void do__display_test_states( ) {

	u8 state_array[ SGAIL__STATE__SIZE ];
	u8 input_block[ SGAIL__INPUT_BLOCK__SIZE ];
	u32 loop_counter;

	printf("\n\n\n");

	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		state_array[ loop_counter ] = loop_counter;

	}

	for ( loop_counter = 0; loop_counter < SGAIL__INPUT_BLOCK__SIZE; loop_counter++ ) {

		input_block[ loop_counter ] = loop_counter & 0xff;

	}

	do__display_state_buffer_bytewise( state_array );
	do__display_state_buffer_64bit_words( (u64 *)state_array );
	do__display_input_block_bytewise( input_block );
	do__display_input_block_64bit_words( (u64 * )input_block );

	printf("\n\n");

}


/*
 * Print the MINIBOX__SIZE bytes of a minibox to stdout on one line, each as
 * two hex digits followed by " : ", then end the line.
 */
void do__display_minibox( u8 minibox[ MINIBOX__SIZE ]  ) {

	const u8 *cursor = minibox;
	const u8 *end = minibox + MINIBOX__SIZE;

	while ( cursor != end ) {
		printf("%02x : ", *cursor );
		cursor++;
	}

	printf("\n");

}





/*
 * Finite Field Arithmetic
 */

/*
 * Multiply element_a by element_b in GF(2^8) and return the product.
 *
 * Shift-and-add method: for each of the eight bits of element_b (lowest
 * first) the current multiple of element_a is XORed into the product when
 * that bit is set; element_a is then doubled, and whenever the doubling
 * pushes a bit out of the top of the byte, reduction_polynomial is XORed in
 * to reduce the value back into the field.
 *
 * Algorithm obtained from : http://en.wikipedia.org/wiki/Finite_field_arithmetic
 */
u8 get__finite_field_multiply( u8 element_a, u8 element_b, u8 reduction_polynomial ) {

	u8 product = 0;
	u8 bits_left;
	u8 carried_out;

	for ( bits_left = 8; bits_left != 0; bits_left-- ) {

		if ( element_b & 1 ) {
			product ^= element_a;
		}

		/* Remember the top bit before the shift discards it */
		carried_out = element_a & 0x80;
		element_a <<= 1;
		if ( carried_out ) {
			element_a ^= reduction_polynomial;
		}

		element_b >>= 1;

	}

	return product;

}


/*
 * Fill ff_array with the full GF(2^8) multiplication table for the given
 * reduction polynomial: ff_array[ a ][ b ] receives
 * get__finite_field_multiply( a, b, reduction_polynomial ) for every pair
 * of indices below GF__SIZE.
 */
void do__create_finite_field_multiply_lut( u8 reduction_polynomial, u8 ff_array[ GF__SIZE ][ GF__SIZE ] ) {

	u32 a, b;
	u8 *products_of_a;

	for ( a = 0; a < GF__SIZE; a++ ) {

		products_of_a = ff_array[ a ];

		for ( b = 0; b < GF__SIZE; b++ ) {
			products_of_a[ b ] = get__finite_field_multiply( a, b, reduction_polynomial );
		}

	}

}


/*
 * Sanity-check a multiplication table.  Returns VS__VALID only if, for every
 * pair ( a, b ) of indices below GF__SIZE:
 *
 *   - a . b == b . a                         (table is symmetric)
 *   - a . b == 0  exactly when a == 0 or b == 0
 *   - a . 1 == a                             (1 is the identity)
 *
 * Returns VS__NOT_VALID as soon as any condition fails.
 */
validStatus get__finite_field_lut_verification( u8 ff_array[ GF__SIZE ][ GF__SIZE ] ) {

	u32 a, b;
	u8 product, mirrored;

	for ( a = 0; a < GF__SIZE; a++ ) {

		for ( b = 0; b < GF__SIZE; b++ ) {

			product  = ff_array[ a ][ b ];
			mirrored = ff_array[ b ][ a ];

			/* (a . b) = (b . a) */
			if ( product != mirrored ) return VS__NOT_VALID;

			if ( a == 0 || b == 0 ) {
				/* a zero operand must give a zero product */
				if ( product != 0 ) return VS__NOT_VALID;
			} else if ( product == 0 ) {
				/* two non-zero operands must never give zero */
				return VS__NOT_VALID;
			}

			/* (a . 1) = a */
			if ( b == 1 && product != a ) return VS__NOT_VALID;

		}

	}

	return VS__VALID;

}



/*
 * Look up the multiplicative inverse of ff_element using the multiplication
 * table ff_array: the result is the index x for which
 * ff_array[ ff_element ][ x ] == 1.
 *
 * If several indices qualify the highest one is returned; if none does
 * (for example for ff_element == 0 in a proper field table) the result is 0.
 * CAREFUL! - no error is reported in that case.
 */
u8 get__inverse_finite_field_element( u8 ff_element, u8 ff_array[ GF__SIZE ][ GF__SIZE ] ) {

	u32 candidate = GF__SIZE;

	/* Scan from the top so the first hit is the highest matching index */
	while ( candidate-- > 0 ) {
		if ( ff_array[ ff_element ][ candidate ] == 1 ) {
			return (u8)candidate;
		}
	}

	return 0;

}


/*
 * Sanity-check get__inverse_finite_field_element() against the table: for
 * every non-zero element a whose reported inverse is non-zero, verify that
 * ( a^-1 ) . ( a ) = 1.  Returns VS__NOT_VALID if any such product differs
 * from 1, otherwise VS__VALID.
 *
 * NOTE(review): a non-zero element for which no inverse was found (lookup
 * returned 0) is skipped rather than reported - confirm this is intended.
 */
validStatus get__finite_field_verify_inverse_element( u8 ff_array[ GF__SIZE ][ GF__SIZE ] ) {

	u32 element;
	u8 inverse;

	/* Element 0 is never checked, so start at 1 */
	for ( element = 1; element < GF__SIZE; element++ ) {

		inverse = get__inverse_finite_field_element( element, ff_array );

		if ( inverse == 0 ) continue;

		if ( ff_array[ inverse ][ element ] != 1 ) return VS__NOT_VALID;

	}

	return VS__VALID;

}




/*
 * Misc Calculations
 */

/*
 * Dot product of two 32-bit vectors over GF(2).
 *
 * Returns 1 when in_a and in_b have an odd number of set bits in common and
 * 0 otherwise, i.e. the XOR of all 32 bits of ( in_a & in_b ).  Every one of
 * the 32 bit positions takes part and the result is always a single bit.
 */
u32 do__get_dot_product( u32 in_a, u32 in_b ) {

	u32 common = in_a & in_b;
	u32 parity = 0;

	/* XOR the bits of the AND together, one position at a time */
	while ( common != 0 ) {

		parity ^= ( common & 1 );
		common >>= 1;

	}

	return( parity );

}


/*
 * Byte-wise Hamming weight of one 64-bit word: the number of its eight
 * bytes that are non-zero (0 to 8).
 */
u32 get__byte_hamming_weight__word( u64 state_word__f ) {

	u32 nonzero_bytes = 0;
	u32 shift;

	for ( shift = 0; shift < 64; shift += 8 ) {
		if ( ( ( state_word__f >> shift ) & 0xFF ) != 0 ) {
			nonzero_bytes++;
		}
	}

	return nonzero_bytes;

}


/*
 * Byte-wise Hamming weight of a quad: the sum of
 * get__byte_hamming_weight__word() over its
 * SGAIL__NUM_64_BIT_WORDS__QUAD words.
 */
u32 get__byte_hamming_weight__quad( u64 quad_array[ SGAIL__NUM_64_BIT_WORDS__QUAD ] ) {

	u32 total = 0;
	u32 remaining = SGAIL__NUM_64_BIT_WORDS__QUAD;

	/* The order of summation is irrelevant, so count down */
	while ( remaining-- > 0 ) {
		total += get__byte_hamming_weight__word( quad_array[ remaining ] );
	}

	return total;

}


/*
 * Byte-wise Hamming weight of the whole state: the sum of
 * get__byte_hamming_weight__word() over its SGAIL__NUM_64_BIT_WORDS words.
 */
u32 get__byte_hamming_weight__state( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const u64 *word = state_array;
	const u64 *end = state_array + SGAIL__NUM_64_BIT_WORDS;
	u32 total = 0;

	for ( ; word != end; word++ ) {
		total += get__byte_hamming_weight__word( *word );
	}

	return total;

}


/*
 * Bit-wise Hamming weight of one 64-bit word: the number of set bits
 * (0 to 64).
 */
u32 get__bit_hamming_weight__word( u64 state_word__f ) {

	u32 set_bits = 0;
	u32 position = 0;

	while ( position < 64 ) {
		set_bits += (u32)( ( state_word__f >> position ) & 0x01 );
		position++;
	}

	return set_bits;

}


/*
 * Bit-wise Hamming weight of a quad: the sum of
 * get__bit_hamming_weight__word() over its
 * SGAIL__NUM_64_BIT_WORDS__QUAD words.
 */
u32 get__bit_hamming_weight__quad( u64 quad_array[ SGAIL__NUM_64_BIT_WORDS__QUAD ] ) {

	u32 total = 0;
	u32 remaining = SGAIL__NUM_64_BIT_WORDS__QUAD;

	/* The order of summation is irrelevant, so count down */
	while ( remaining-- > 0 ) {
		total += get__bit_hamming_weight__word( quad_array[ remaining ] );
	}

	return total;

}


/*
 * Bit-wise Hamming weight of the whole state: the sum of
 * get__bit_hamming_weight__word() over its SGAIL__NUM_64_BIT_WORDS words.
 */
u32 get__bit_hamming_weight__state( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const u64 *word = state_array;
	const u64 *end = state_array + SGAIL__NUM_64_BIT_WORDS;
	u32 total = 0;

	for ( ; word != end; word++ ) {
		total += get__bit_hamming_weight__word( *word );
	}

	return total;

}





/*
 * Cauchy Matrix Generation & MDS Matrix Code
 */

/*
 * Build the n_size x n_size Cauchy matrix defined by the vectors F (f_array)
 * and G (g_array), each of length n_size.
 *
 * The vectors are validated first; every offending pair is reported on
 * stdout:
 *   - no F[i] may equal any G[j]   (F[i] + G[j] must be non-zero)
 *   - the entries of F must be pairwise distinct
 *   - the entries of G must be pairwise distinct
 *
 * On success cauchy_matrix (row-major, n_size entries per row) receives
 * ( F[i] + G[j] )^-1 at row i, column j, where + is XOR and the inverse is
 * looked up through ff_array, and IES__SUCCESS is returned.  If validation
 * fails cauchy_matrix is left untouched and IES__GENERAL_ERROR is returned.
 */
internalErrorStatus do__generate_cauchy_matrix( u32 n_size, const u8 *f_array, const u8 *g_array, u8 *cauchy_matrix, u8 ff_array[ GF__SIZE ][ GF__SIZE ] ) {

	internalErrorStatus status = IES__SUCCESS;
	u32 i, j;

	/* Validation pass over every ( i, j ) pair */
	for ( i = 0; i < n_size; i++ ) {

		for ( j = 0; j < n_size; j++ ) {

			if ( f_array[ i ] == g_array[ j ] ) {
				printf("Invalid f,g entries at %3u, %3u : [ %3u, %3u ] = %3u\n", i, j, f_array[ i ], g_array[ j ], f_array[ i ] ^ g_array[ j ] );
				status = IES__GENERAL_ERROR;
			}

			/* The distinctness checks only apply to different positions */
			if ( i == j ) continue;

			if ( f_array[ i ] == f_array[ j ] ) {
				printf("Invalid f, f entries at %3u, %3u : [ %3u, %3u ] = %3u\n", i, j, f_array[ i ], f_array[ j ], f_array[ i ] ^ f_array[ j ] );
				status = IES__GENERAL_ERROR;
			}

			if ( g_array[ i ] == g_array[ j ] ) {
				printf("Invalid g, g entries at %3u, %3u : [ %3u, %3u ] = %3u\n", i, j, g_array[ i ], g_array[ j ], g_array[ i ] ^ g_array[ j ] );
				status = IES__GENERAL_ERROR;
			}

		}

	}

	if ( status != IES__SUCCESS ) {
		return status;
	}

	/* Each entry is the field inverse of F[i] + G[j] */
	for ( i = 0; i < n_size; i++ ) {

		for ( j = 0; j < n_size; j++ ) {
			cauchy_matrix[ ( i * n_size ) + j ] = get__inverse_finite_field_element( f_array[ i ] ^ g_array[ j ], ff_array );
		}

	}

	return status;

}


/*
 * Check that the sbox is sane: every value from 0 to SBOX__SIZE - 1 must
 * appear somewhere among its SBOX__SIZE entries.  Returns VS__VALID if so,
 * VS__NOT_VALID as soon as a missing value is found.
 */
validStatus get__sbox_verification( const u8 sbox[ SBOX__SIZE ] ) {

	u32 wanted, position;

	for ( wanted = 0; wanted < SBOX__SIZE; wanted++ ) {

		/* Advance until the wanted value is found or the table is exhausted */
		position = 0;
		while ( position < SBOX__SIZE && sbox[ position ] != wanted ) {
			position++;
		}

		if ( position == SBOX__SIZE ) return VS__NOT_VALID;

	}

	return VS__VALID;

}


/*
 * Build the lookup table used for the fast combined sbox and 8x8 MDS step.
 *
 * For every matrix column col and every input byte value v the entry
 * mds_8x8s[ col ][ v ] is produced as follows:
 *   - for each row r, take sbox_set[ r ][ v ] (the sbox is selected by ROW
 *     rather than by column - this differs from the usual construction, see
 *     the documentation) and multiply it by mds_matrix[ r ][ col ] through
 *     ff_array;
 *   - pack the products into one 64-bit word, row 0 in the most significant
 *     byte;
 *   - XOR the word with itself rotated left by MDS__64BIT__ROTATE.
 *
 * Assumes MDS__64BIT__SIZE is 8, so that one column fills a 64-bit word.
 */
void do__prepare_mds_8x8s_table( u8 mds_matrix[ MDS__64BIT__SIZE ][ MDS__64BIT__SIZE ], u8 ff_array[ GF__SIZE ][ GF__SIZE ], u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] ) {

	u32 row, col, value;
	u64 packed;
	u8  substituted, coefficient;

	for ( col = 0; col < MDS__64BIT__SIZE; col++ ) {

		for ( value = 0; value < SBOX__SIZE; value++ ) {

			packed = 0;

			for ( row = 0; row < MDS__64BIT__SIZE; row++ ) {

				coefficient = mds_matrix[ row ][ col ];
				substituted = sbox_set[ row ][ value ];
				packed = ( packed << 8 ) ^ ff_array[ substituted ][ coefficient ];

			}

			mds_8x8s[ col ][ value ] = packed ^ ROTL_W( packed, MDS__64BIT__ROTATE, WORD_BITS_64, WORD_MODULUS_64 );

		}

	}

}


/*
 * Build the two lookup tables used for the fast combined sbox and 16x16 MDS
 * step.
 *
 * For every matrix column col and every table index v:
 *   - for each row r (0..15), take sbox_set[ r + MDS__64BIT__SIZE ][ v ]
 *     (the sbox is selected by row, offset past the sboxes used by the 8x8
 *     table - this differs from the usual construction, see the
 *     documentation) and multiply it by mds_matrix[ r ][ col ] through
 *     ff_array;
 *   - rows 0..7 are packed into one 64-bit word (row 0 in the most
 *     significant byte), XORed with itself rotated left by
 *     MDS__128BIT__ROTATE_LHS and stored in mds_16x8s_lhs[ col ][ v ];
 *   - rows 8..15 are packed the same way (row 8 most significant), XORed
 *     with itself rotated left by MDS__128BIT__ROTATE_RHS and stored in
 *     mds_16x8s_rhs[ col ][ v ].
 *
 * Each entry is computed directly, so no intermediate byte table has to be
 * held on the stack.
 */
void do__prepare_mds_16x8s_table( u8 mds_matrix[ MDS__128BIT__SIZE ][ MDS__128BIT__SIZE ], u8 ff_array[ GF__SIZE ][ GF__SIZE ], u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] ) {

	u32 row_loop_counter, col_loop_counter, sbox_loop_counter;
	u64 mds_result;
	u8  mds_entry, sbox_result;

	for ( col_loop_counter = 0; col_loop_counter < MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE; col_loop_counter++ ) {

		for ( sbox_loop_counter = 0; sbox_loop_counter < MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE; sbox_loop_counter++ ) {

			/* Left-hand word: matrix rows 0..7 */
			mds_result = 0;
			for ( row_loop_counter = 0; row_loop_counter < 8; row_loop_counter++ ) {

				mds_entry = mds_matrix[ row_loop_counter ][ col_loop_counter ];
				sbox_result = sbox_set[ row_loop_counter + MDS__64BIT__SIZE ][ sbox_loop_counter ]; /* THIS IS DIFFERENT FROM NORMAL - SEE DOCUMENTATION */
				mds_result = ( mds_result << 8 ) ^ ff_array[ sbox_result ][ mds_entry ];

			}
			mds_16x8s_lhs[ col_loop_counter ][ sbox_loop_counter ] = mds_result ^ ROTL_W( mds_result, MDS__128BIT__ROTATE_LHS, WORD_BITS_64, WORD_MODULUS_64 );

			/* Right-hand word: matrix rows 8..15 */
			mds_result = 0;
			for ( row_loop_counter = 8; row_loop_counter < 16; row_loop_counter++ ) {

				mds_entry = mds_matrix[ row_loop_counter ][ col_loop_counter ];
				sbox_result = sbox_set[ row_loop_counter + MDS__64BIT__SIZE ][ sbox_loop_counter ]; /* THIS IS DIFFERENT FROM NORMAL - SEE DOCUMENTATION */
				mds_result = ( mds_result << 8 ) ^ ff_array[ sbox_result ][ mds_entry ];

			}
			mds_16x8s_rhs[ col_loop_counter ][ sbox_loop_counter ] = mds_result ^ ROTL_W( mds_result, MDS__128BIT__ROTATE_RHS, WORD_BITS_64, WORD_MODULUS_64 );

		}

	}

}


/*
 * Build TEST versions of the two 16x16 MDS lookup tables.  There is no sbox
 * lookup: the table index itself (truncated to a byte) is multiplied by the
 * matrix entries, so the tables can be used to check the MDS calculation and
 * to test diffusion from other functions.
 *
 * For every matrix column col and every table index v:
 *   - for each row r (0..15), multiply (u8)v by mds_matrix[ r ][ col ]
 *     through ff_array;
 *   - rows 0..7 are packed into one 64-bit word (row 0 in the most
 *     significant byte), XORed with itself rotated left by
 *     MDS__128BIT__ROTATE_LHS and stored in mds_16x8s_lhs[ col ][ v ];
 *   - rows 8..15 are packed the same way (row 8 most significant), XORed
 *     with itself rotated left by MDS__128BIT__ROTATE_RHS and stored in
 *     mds_16x8s_rhs[ col ][ v ].
 *
 * sbox_set is accepted for signature compatibility with
 * do__prepare_mds_16x8s_table() but is not used.  Each entry is computed
 * directly, so no intermediate byte table has to be held on the stack.
 */
void do__prepare_test_mds_16x8s_table( u8 mds_matrix[ MDS__128BIT__SIZE ][ MDS__128BIT__SIZE ], u8 ff_array[ GF__SIZE ][ GF__SIZE ], u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] ) {

	u32 row_loop_counter, col_loop_counter, sbox_loop_counter;
	u64 mds_result;
	u8  mds_entry, sbox_result;

	(void)sbox_set;		/* deliberately unused in the test table */

	for ( col_loop_counter = 0; col_loop_counter < MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE; col_loop_counter++ ) {

		for ( sbox_loop_counter = 0; sbox_loop_counter < MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE; sbox_loop_counter++ ) {

			/* No substitution: the index byte stands in for the sbox output */
			sbox_result = (u8)sbox_loop_counter;

			/* Be careful to preserve endianess here: the lowest row goes in the most significant byte */
			/* Left-hand word: matrix rows 0..7 */
			mds_result = 0;
			for ( row_loop_counter = 0; row_loop_counter < 8; row_loop_counter++ ) {

				mds_entry = mds_matrix[ row_loop_counter ][ col_loop_counter ];
				mds_result = ( mds_result << 8 ) ^ ff_array[ sbox_result ][ mds_entry ];

			}
			mds_16x8s_lhs[ col_loop_counter ][ sbox_loop_counter ] = mds_result ^ ROTL_W( mds_result, MDS__128BIT__ROTATE_LHS, WORD_BITS_64, WORD_MODULUS_64 );

			/* Right-hand word: matrix rows 8..15 */
			mds_result = 0;
			for ( row_loop_counter = 8; row_loop_counter < 16; row_loop_counter++ ) {

				mds_entry = mds_matrix[ row_loop_counter ][ col_loop_counter ];
				mds_result = ( mds_result << 8 ) ^ ff_array[ sbox_result ][ mds_entry ];

			}
			mds_16x8s_rhs[ col_loop_counter ][ sbox_loop_counter ] = mds_result ^ ROTL_W( mds_result, MDS__128BIT__ROTATE_RHS, WORD_BITS_64, WORD_MODULUS_64 );

			/* ******************************* possibly add 128-bit rotate ? *********/

		}

	}

}


/*
 * Fast combined sbox and 8x8 MDS step using the precomputed mds_8x8s table.
 *
 * Each of the eight input bytes selects one 64-bit entry from its own
 * sub-table ( mds_8x8s[ i ][ input_vector[ i ] ] ); the XOR of the eight
 * entries is written to output_vector[ 0 ].
 */
void do__single_mds_8x8s( u8 input_vector[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ], u64 output_vector[ 1 ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] ) {

	u64 accumulator = 0;
	unsigned int lane;

	for ( lane = 0; lane < 8; lane++ ) {
		accumulator ^= mds_8x8s[ lane ][ input_vector[ lane ] ];
	}

	output_vector[ 0 ] = accumulator;

}


/*
 * Fold sixteen table lookups into two 64-bit words.
 *
 * input_vector  : sixteen bytes; byte i selects an entry from row i of each table
 * output_vector : element 0 receives the XOR of the sixteen mds_16x8s_lhs entries,
 *                 element 1 receives the XOR of the sixteen mds_16x8s_rhs entries
 *
 * The left-hand word is stored before any right-hand lookup is made; that
 * ordering is kept deliberately so the result is unchanged even if a caller
 * passes overlapping input and output buffers.
 */
void do__single_mds_16x8s( u8 input_vector[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ], u64 output_vector[ 2 ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] ) {

	u64 folded;
	u32 row;

	/* Left-hand output word */
	folded = 0;
	for ( row = 0; row < 16; row++ ) {
		folded ^= mds_16x8s_lhs[ row ][ input_vector[ row ] ];
	}
	output_vector[ 0 ] = folded;

	/* Right-hand output word */
	folded = 0;
	for ( row = 0; row < 16; row++ ) {
		folded ^= mds_16x8s_rhs[ row ][ input_vector[ row ] ];
	}
	output_vector[ 1 ] = folded;

}


/*
 * Keyed sbox/MDS layer over the whole state.
 *
 * do__xor_key_with_state( state_array, key_array ) is called first (defined
 * elsewhere; by its name it XORs the key into the state).  The 16-byte table
 * transform is then applied to each consecutive pair of state words
 * (0-1, 2-3, ... 30-31), writing the matching pair of out_state_array.
 *
 * mds_8x8s is part of the signature but is not used by this function.
 */
void do__full_mds_state_update( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 out_state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 key_array[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] ) {

	u32 pair_start;

	do__xor_key_with_state( state_array, key_array );

	/* ------[ Do mds 16x8 on word pairs 0, 2, ... 30 ]------ */
	for ( pair_start = 0; pair_start < 32; pair_start += 2 ) {
		do__single_mds_16x8s( (u8 *)&state_array[ pair_start ], &out_state_array[ pair_start ], mds_16x8s_lhs, mds_16x8s_rhs );
	}

}



/*
 * Unkeyed sbox/MDS layer over the whole state.
 *
 * Applies the 16-byte table transform to each consecutive pair of state words
 * (0-1, 2-3, ... 30-31), writing the matching pair of out_state_array.
 * state_array itself is only read.
 *
 * mds_8x8s is part of the signature but is not used by this function.
 */
void do__full_mds_state_update__no_key( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 out_state_array[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] ) {

	u32 pair_start;

	/* ------[ Do mds 16x8 on word pairs 0, 2, ... 30 ]------ */
	for ( pair_start = 0; pair_start < 32; pair_start += 2 ) {
		do__single_mds_16x8s( (u8 *)&state_array[ pair_start ], &out_state_array[ pair_start ], mds_16x8s_lhs, mds_16x8s_rhs );
	}

}






/*
 * Global Diffusion Primitives
 */

/*
 * Quadrant 0 diffusion.
 *
 * Quadrant layout used by the four do__quad_diffuse__qN functions:
 *   Q0 = even words 0..14,  Q1 = odd words 1..15,
 *   Q2 = even words 16..30, Q3 = odd words 17..31.
 *
 * Two chained passes are run over the eight Q0 words, each word being mixed
 * with the Q0 word before it (word 0 wraps round to word 14); the updated Q0
 * words are then rotated with ROTL_W and XORed into the corresponding words
 * of the other three quadrants.  The state is modified in place.
 */
void do__quad_diffuse__q0( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const unsigned int mix_rot[ 3 ] = { QD_0_ROT_0, QD_0_ROT_1, QD_0_ROT_2 };
	unsigned int step, cur, prev;

	/* Pass 1: word += rot( word ^ previous ); rotation amounts cycle 0,1,2,0,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 2 * step;
		prev = ( cur + 14 ) % 16;
		state_array[ cur ] += ROTL_W( state_array[ cur ] ^ state_array[ prev ], mix_rot[ step % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Pass 2: word ^= rot( word + previous ); rotation amounts cycle 2,0,1,2,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 2 * step;
		prev = ( cur + 14 ) % 16;
		state_array[ cur ] ^= ROTL_W( state_array[ cur ] + state_array[ prev ], mix_rot[ ( step + 2 ) % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Spread each Q0 word over its partners; Q0 is only read from here on */
	for ( cur = 0; cur < 16; cur += 2 ) {
		state_array[ cur + 1 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_1, WORD_BITS_64, WORD_MODULUS_64 );	/* Q1 */
		state_array[ cur + 16 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_2, WORD_BITS_64, WORD_MODULUS_64 );	/* Q2 */
		state_array[ cur + 17 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_3, WORD_BITS_64, WORD_MODULUS_64 );	/* Q3 */
	}

}


/*
 * Quadrant 1 diffusion.
 *
 * Quadrant layout: Q0 = even words 0..14, Q1 = odd words 1..15,
 *                  Q2 = even words 16..30, Q3 = odd words 17..31.
 *
 * Two chained passes are run over the eight Q1 words, each word being mixed
 * with the Q1 word before it (word 1 wraps round to word 15); the updated Q1
 * words are then rotated with ROTL_W and XORed into the corresponding words
 * of Q0, Q2 and Q3.  The state is modified in place.
 *
 * NOTE: every Q2 word 16..30 receives exactly one Q1 contribution.  Earlier
 * code sent word 13's Q2 contribution to word 29 instead of word 28 (so word
 * 28 was skipped and word 29 was hit twice); digests produced with that code
 * will differ from this version.
 */
void do__quad_diffuse__q1( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const unsigned int mix_rot[ 3 ] = { QD_1_ROT_0, QD_1_ROT_1, QD_1_ROT_2 };
	unsigned int step, cur, prev;

	/* Pass 1: word += rot( word ^ previous ); rotation amounts cycle 0,1,2,0,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 2 * step + 1;
		prev = ( cur + 14 ) % 16;
		state_array[ cur ] += ROTL_W( state_array[ cur ] ^ state_array[ prev ], mix_rot[ step % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Pass 2: word ^= rot( word + previous ); rotation amounts cycle 2,0,1,2,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 2 * step + 1;
		prev = ( cur + 14 ) % 16;
		state_array[ cur ] ^= ROTL_W( state_array[ cur ] + state_array[ prev ], mix_rot[ ( step + 2 ) % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Spread each Q1 word over its partners; Q1 is only read from here on */
	for ( cur = 1; cur < 16; cur += 2 ) {
		state_array[ cur - 1 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_0, WORD_BITS_64, WORD_MODULUS_64 );	/* Q0 */
		state_array[ cur + 15 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_2, WORD_BITS_64, WORD_MODULUS_64 );	/* Q2 */
		state_array[ cur + 16 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_3, WORD_BITS_64, WORD_MODULUS_64 );	/* Q3 */
	}

}


/*
 * Quadrant 2 diffusion.
 *
 * Quadrant layout: Q0 = even words 0..14, Q1 = odd words 1..15,
 *                  Q2 = even words 16..30, Q3 = odd words 17..31.
 *
 * Two chained passes are run over the eight Q2 words, each word being mixed
 * with the Q2 word before it (word 16 wraps round to word 30); the updated Q2
 * words are then rotated with ROTL_W and XORed into the corresponding words
 * of Q0, Q1 and Q3.  The state is modified in place.
 *
 * NOTE(review): the second pass takes its rotation amounts from QD_3_ROT_*,
 * not QD_2_ROT_*.  The other quadrant functions use a single QD_n_ROT_* set
 * for both passes - confirm against the specification which is intended.
 */
void do__quad_diffuse__q2( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const unsigned int first_pass_rot[ 3 ] = { QD_2_ROT_0, QD_2_ROT_1, QD_2_ROT_2 };
	const unsigned int second_pass_rot[ 3 ] = { QD_3_ROT_0, QD_3_ROT_1, QD_3_ROT_2 };
	unsigned int step, cur, prev;

	/* Pass 1: word += rot( word ^ previous ); rotation amounts cycle 0,1,2,0,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 16 + 2 * step;
		prev = 16 + ( 2 * step + 14 ) % 16;
		state_array[ cur ] += ROTL_W( state_array[ cur ] ^ state_array[ prev ], first_pass_rot[ step % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Pass 2: word ^= rot( word + previous ); rotation amounts cycle 2,0,1,2,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 16 + 2 * step;
		prev = 16 + ( 2 * step + 14 ) % 16;
		state_array[ cur ] ^= ROTL_W( state_array[ cur ] + state_array[ prev ], second_pass_rot[ ( step + 2 ) % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Spread each Q2 word over its partners; Q2 is only read from here on */
	for ( cur = 16; cur < 32; cur += 2 ) {
		state_array[ cur - 16 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_0, WORD_BITS_64, WORD_MODULUS_64 );	/* Q0 */
		state_array[ cur - 15 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_1, WORD_BITS_64, WORD_MODULUS_64 );	/* Q1 */
		state_array[ cur + 1 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_3, WORD_BITS_64, WORD_MODULUS_64 );	/* Q3 */
	}

}



/*
 * Quadrant 3 diffusion.
 *
 * Quadrant layout: Q0 = even words 0..14, Q1 = odd words 1..15,
 *                  Q2 = even words 16..30, Q3 = odd words 17..31.
 *
 * Two chained passes are run over the eight Q3 words, each word being mixed
 * with the Q3 word before it (word 17 wraps round to word 31); the updated Q3
 * words are then rotated with ROTL_W and XORed into the corresponding words
 * of Q0, Q1 and Q2.  The state is modified in place.
 */
void do__quad_diffuse__q3( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	const unsigned int mix_rot[ 3 ] = { QD_3_ROT_0, QD_3_ROT_1, QD_3_ROT_2 };
	unsigned int step, cur, prev;

	/* Pass 1: word += rot( word ^ previous ); rotation amounts cycle 0,1,2,0,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 17 + 2 * step;
		prev = 17 + ( 2 * step + 14 ) % 16;
		state_array[ cur ] += ROTL_W( state_array[ cur ] ^ state_array[ prev ], mix_rot[ step % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Pass 2: word ^= rot( word + previous ); rotation amounts cycle 2,0,1,2,... */
	for ( step = 0; step < 8; step++ ) {
		cur = 17 + 2 * step;
		prev = 17 + ( 2 * step + 14 ) % 16;
		state_array[ cur ] ^= ROTL_W( state_array[ cur ] + state_array[ prev ], mix_rot[ ( step + 2 ) % 3 ], WORD_BITS_64, WORD_MODULUS_64 );
	}

	/* Spread each Q3 word over its partners; Q3 is only read from here on */
	for ( cur = 17; cur < 32; cur += 2 ) {
		state_array[ cur - 17 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_0, WORD_BITS_64, WORD_MODULUS_64 );	/* Q0 */
		state_array[ cur - 16 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_1, WORD_BITS_64, WORD_MODULUS_64 );	/* Q1 */
		state_array[ cur - 1 ] ^= ROTL_W( state_array[ cur ], QD_X_ROT_2, WORD_BITS_64, WORD_MODULUS_64 );	/* Q2 */
	}

}



/*
 * Apply Pseudo-Hadamard Transforms across quadrant boundaries to diffuse globally.
 *
 * Each transform takes a pair ( a, b ) and performs a += b; b += a; in place,
 * i.e. a' = a + b and b' = a + 2b in u64 arithmetic.
 *
 * Pairings (sixteen in all, every state word is used exactly once):
 *   - Q0 word 2k       with Q3 word 2k + 17           (k = 0..7)
 *   - Q1 word (9+2k)%16 with Q2 word 16 + 2k          (k = 0..7)
 */
void do__pht_a_diffuse( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ) {

	unsigned int step, first, second;

	/* Q0 <-> Q3 */
	for ( step = 0; step < 8; step++ ) {
		first = 2 * step;
		second = first + 17;
		state_array[ first ] += state_array[ second ];
		state_array[ second ] += state_array[ first ];
	}

	/* Q1 <-> Q2; the Q1 side starts at word 9 and wraps round to word 1 */
	for ( step = 0; step < 8; step++ ) {
		first = ( 9 + 2 * step ) % 16;
		second = 16 + 2 * step;
		state_array[ first ] += state_array[ second ];
		state_array[ second ] += state_array[ first ];
	}

}


/*
 * Second stage of PHTs.
 *
 * The state is handled as four groups of eight consecutive words (bases 0, 8,
 * 16 and 24).  Within each group the even word at offset 0, 2, 4, 6 is paired
 * with the odd word at offset 3, 7, 1, 5 respectively, and each pair ( a, b )
 * undergoes a += b; b += a; in place.
 */
void do__pht_b_diffuse( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ] ){

	static const unsigned int partner_offset[ 4 ] = { 3, 7, 1, 5 };
	unsigned int base, pair, even_word, odd_word;

	for ( base = 0; base < 32; base += 8 ) {

		for ( pair = 0; pair < 4; pair++ ) {

			even_word = base + 2 * pair;
			odd_word = base + partner_offset[ pair ];

			state_array[ even_word ] += state_array[ odd_word ];
			state_array[ odd_word ] += state_array[ even_word ];

		}

	}

}






/*
 * Permutation Code
 */

/*
 * Create a keyed translation permutation array by shuffling xlate_array in place.
 *
 * xlate_array : should already hold a permutation on entry (simplest is a
 *               memcpy of the sbox); only swaps are applied to it
 * key_array   : one key byte is consumed per iteration
 * initial_j   : seed; the running index starts at sbox[ initial_j ]
 * sbox        : used both to scramble the running index and to choose which
 *               element is swapped on each iteration
 */
void do__permutate_xlate_buffer( u8 xlate_array[ SGAIL__STATE__SIZE ], u8 key_array[ SGAIL__STATE__SIZE ], u8 initial_j, const u8 sbox[ SBOX__SIZE ] ) {

	u32 pos;
	u8 running, target, held;

	running = sbox[ initial_j ];

	for ( pos = 0; pos < SGAIL__STATE__SIZE; pos++ ) {

		/* Advance the running index; the sum is truncated to a u8 before the sbox lookup */
		running = sbox[ (u8)( running + xlate_array[ running ] + key_array[ pos ] ) ];

		/* Swap x[ s[ pos ] ] <-> x[ running ]; going through s[ pos ] visits the elements in a permuted order */
		target = sbox[ pos ];
		held = xlate_array[ target ];
		xlate_array[ target ] = xlate_array[ running ];
		xlate_array[ running ] = held;

	}

}


/*
 * Permute the bytes of the state through xlate_array, then apply the 8-byte
 * table transform to each group of eight permuted bytes.
 *
 * For output word w, the eight bytes in_state_array[ xlate_array[ base + k ] ]
 * (k = 0..7) each select an entry from row k of mds_8x8s; the XOR of those
 * entries becomes out_state_array[ w ].  base is w shifted left by
 * SGAIL__NUM_64_BIT_WORDS__SHIFT (per the original note, a multiply by 8 so
 * that each word lines up with its own eight xlate entries).
 */
void do__xlate_state_mds_8x8s( u8 in_state_array[ SGAIL__STATE__SIZE ], u64 out_state_array[ SGAIL__NUM_64_BIT_WORDS ], u8 xlate_array[ SGAIL__STATE__SIZE ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] ) {

	u32 word, lane, base;
	u64 folded;

	for ( word = 0; word < SGAIL__NUM_64_BIT_WORDS; word++ ) {

		base = word << SGAIL__NUM_64_BIT_WORDS__SHIFT;

		folded = 0;
		for ( lane = 0; lane < 8; lane++ ) {
			folded ^= mds_8x8s[ lane ][ in_state_array[ xlate_array[ base + lane ] ] ];
		}

		out_state_array[ word ] = folded;

	}

}






/*
 * Key Preparation and Round Key Extraction Functions
 */

/* Preliminary key processing functions */

/*
 * Derive the eight-word preliminary key.
 *
 * Eight input words are absorbed in this order:
 *   serial_number, secret_key[ 0 ], block_count__low_word, secret_key[ 1 ],
 *   final_block__bit_count, secret_key[ 2 ], block_count__high_word, secret_key[ 3 ]
 *
 * Slot 0 is the 8-byte table transform of the first input.  Every later slot
 * k is the transform of ( input k + preliminary_key[ k - 1 ] ), so each slot
 * depends on all the inputs before it.  A final running sum then folds slot 7
 * into slot 0 and each slot into the next.
 *
 * The transform reads the 64-bit word through a u8 pointer, i.e. in the
 * host's in-memory byte order.
 */
void do__process_preliminary_key( u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 secret_key[ SECRET_KEY__64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], u64 serial_number, u64 block_count__high_word, u64 block_count__low_word, u64 final_block__bit_count ) {

	/* Inputs are referenced rather than copied so each is read at the moment it is absorbed */
	const u64 *material[ 8 ] = {
		&serial_number,
		&secret_key[ 0 ],
		&block_count__low_word,
		&secret_key[ 1 ],
		&final_block__bit_count,
		&secret_key[ 2 ],
		&block_count__high_word,
		&secret_key[ 3 ]
	};
	u64 mix_word;
	u32 slot;

	/* Chained absorption */
	for ( slot = 0; slot < 8; slot++ ) {

		mix_word = *material[ slot ];
		if ( slot > 0 ) {
			mix_word += preliminary_key[ slot - 1 ];
		}

		do__single_mds_8x8s( (u8 *) &mix_word, &preliminary_key[ slot ], mds_8x8s );

	}

	/* Running sum: slot 0 takes slot 7, then each slot takes the one before it */
	preliminary_key[ 0 ] += preliminary_key[ 7 ];
	for ( slot = 1; slot < 8; slot++ ) {
		preliminary_key[ slot ] += preliminary_key[ slot - 1 ];
	}

}


/* Principle key processing functions */
/*
 * Derive a principle key from one message block using a single round.
 *
 * Sequence:
 *   1. message_block (viewed as bytes) goes through do__xlate_state_mds_8x8s
 *      with xlate_array__pre into a scratch state.
 *   2. preliminary_key[ 0..7 ] is XORed into the even words 0, 2, .. 14.
 *   3. do__pht_a_diffuse, do__quad_diffuse__q0 and do__pht_b_diffuse run in
 *      place, then do__full_mds_state_update__no_key writes the second
 *      scratch state.
 *   4. preliminary_key is XORed into the even words 0..14 again.
 *   5. The state goes through do__xlate_state_mds_8x8s with
 *      xlate_array__post into principle_key.
 *   6. do__xor_key_with_state( principle_key, message_block ) is applied
 *      (presumably folding the original block into principle_key - the
 *      debug trace prints principle_key afterwards).
 *
 * The sbox parameter is not referenced by this function.
 *
 * Both scratch states are zeroed before returning.  The stores go through
 * volatile-qualified pointers so that the compiler cannot drop them as
 * dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__single__1_rounds( u64 message_block[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u8 xlate_array__pre[ SGAIL__STATE__SIZE ], u8 xlate_array__post[ SGAIL__STATE__SIZE ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 temp_array[ SGAIL__NUM_64_BIT_WORDS ], temp_array_b[ SGAIL__NUM_64_BIT_WORDS ];
	volatile u64 *wipe_a = temp_array;
	volatile u64 *wipe_b = temp_array_b;
	u32 word_index;

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block\n" );
	do__display_state_buffer_64bit_words( message_block );
#endif

	do__xlate_state_mds_8x8s( (u8 *) message_block, temp_array, xlate_array__pre, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after xlate\n" );
	do__display_state_buffer_64bit_words( temp_array );
#endif

	/* First key injection: preliminary word n lands on state word 2n */
	for ( word_index = 0; word_index < 8; word_index++ ) {

		temp_array[ 2 * word_index ] ^= preliminary_key[ word_index ];

	}
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after first xor of preliminary key\n" );
	do__display_state_buffer_64bit_words( temp_array );
#endif

	do__pht_a_diffuse( temp_array );
	do__quad_diffuse__q0( temp_array );
	do__pht_b_diffuse( temp_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after phts and quad diffuse\n" );
	do__display_state_buffer_64bit_words( temp_array );
#endif

	do__full_mds_state_update__no_key( temp_array, temp_array_b, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after round one full mds update\n" );
	do__display_state_buffer_64bit_words( temp_array_b );
#endif

	/* Second key injection, same word layout as the first */
	for ( word_index = 0; word_index < 8; word_index++ ) {

		temp_array_b[ 2 * word_index ] ^= preliminary_key[ word_index ];

	}
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after second xor of preliminary key\n" );
	do__display_state_buffer_64bit_words( temp_array_b );
#endif

	do__xlate_state_mds_8x8s( (u8 *) temp_array_b, principle_key, xlate_array__post, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after final xlate\n" );
	do__display_state_buffer_64bit_words( principle_key );
#endif

	do__xor_key_with_state( principle_key, message_block );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key single 1 round, message block after final xor of original message block\n" );
	do__display_state_buffer_64bit_words( principle_key );
#endif

	/* clear up - volatile stores so the wipe survives optimisation */
	for ( word_index = 0; word_index < SGAIL__NUM_64_BIT_WORDS; word_index++ ) {

		wipe_a[ word_index ] = 0;
		wipe_b[ word_index ] = 0;

	}

}


/*
 * Derive a principle key from one message block using two rounds.
 *
 * message_block (viewed as bytes) is first passed through
 * do__xlate_state_mds_8x8s with xlate_array__pre.  Each round then
 *   - XORs preliminary_key[ 0..7 ] into the even state words 0, 2, .. 14,
 *   - runs do__pht_a_diffuse, do__quad_diffuse__q0 and do__pht_b_diffuse
 *     in place,
 *   - runs do__full_mds_state_update__no_key from the current scratch
 *     state into the other one.
 * After the last round the preliminary key is XORed in once more, the
 * state goes through do__xlate_state_mds_8x8s with xlate_array__post into
 * principle_key, and do__xor_key_with_state( principle_key, message_block )
 * is applied.
 *
 * The sbox parameter is not referenced by this function.
 *
 * Both scratch states are zeroed before returning.  The stores go through
 * volatile-qualified pointers so that the compiler cannot drop them as
 * dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__single__2_rounds( u64 message_block[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u8 xlate_array__pre[ SGAIL__STATE__SIZE ], u8 xlate_array__post[ SGAIL__STATE__SIZE ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 temp_array[ SGAIL__NUM_64_BIT_WORDS ], temp_array_b[ SGAIL__NUM_64_BIT_WORDS ];
	u64 *current_state = temp_array;
	u64 *next_state = temp_array_b;
	u64 *swap_state;
	volatile u64 *wipe_a = temp_array;
	volatile u64 *wipe_b = temp_array_b;
	u32 round_number, word_index;

	do__xlate_state_mds_8x8s( (u8 *) message_block, current_state, xlate_array__pre, mds_8x8s );

	/* Two rounds; the scratch states swap roles after every full MDS update */
	for ( round_number = 0; round_number < 2; round_number++ ) {

		/* Key injection: preliminary word n lands on state word 2n */
		for ( word_index = 0; word_index < 8; word_index++ ) {

			current_state[ 2 * word_index ] ^= preliminary_key[ word_index ];

		}

		do__pht_a_diffuse( current_state );
		do__quad_diffuse__q0( current_state );
		do__pht_b_diffuse( current_state );

		do__full_mds_state_update__no_key( current_state, next_state, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

		swap_state = current_state;
		current_state = next_state;
		next_state = swap_state;

	}

	/* Closing key injection on the output of the last round */
	for ( word_index = 0; word_index < 8; word_index++ ) {

		current_state[ 2 * word_index ] ^= preliminary_key[ word_index ];

	}

	do__xlate_state_mds_8x8s( (u8 *) current_state, principle_key, xlate_array__post, mds_8x8s );

	do__xor_key_with_state( principle_key, message_block );

	/* clear up - volatile stores so the wipe survives optimisation */
	for ( word_index = 0; word_index < SGAIL__NUM_64_BIT_WORDS; word_index++ ) {

		wipe_a[ word_index ] = 0;
		wipe_b[ word_index ] = 0;

	}

}


/*
 * Derive a principle key from one message block using three rounds.
 *
 * message_block (viewed as bytes) is first passed through
 * do__xlate_state_mds_8x8s with xlate_array__pre.  Each round then
 *   - XORs preliminary_key[ 0..7 ] into the even state words 0, 2, .. 14,
 *   - runs do__pht_a_diffuse, do__quad_diffuse__q0 and do__pht_b_diffuse
 *     in place,
 *   - runs do__full_mds_state_update__no_key from the current scratch
 *     state into the other one.
 * After the last round the preliminary key is XORed in once more, the
 * state goes through do__xlate_state_mds_8x8s with xlate_array__post into
 * principle_key, and do__xor_key_with_state( principle_key, message_block )
 * is applied.
 *
 * The sbox parameter is not referenced by this function.
 *
 * Both scratch states are zeroed before returning.  The stores go through
 * volatile-qualified pointers so that the compiler cannot drop them as
 * dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__single__3_rounds( u64 message_block[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u8 xlate_array__pre[ SGAIL__STATE__SIZE ], u8 xlate_array__post[ SGAIL__STATE__SIZE ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 temp_array[ SGAIL__NUM_64_BIT_WORDS ], temp_array_b[ SGAIL__NUM_64_BIT_WORDS ];
	u64 *current_state = temp_array;
	u64 *next_state = temp_array_b;
	u64 *swap_state;
	volatile u64 *wipe_a = temp_array;
	volatile u64 *wipe_b = temp_array_b;
	u32 round_number, word_index;

	do__xlate_state_mds_8x8s( (u8 *) message_block, current_state, xlate_array__pre, mds_8x8s );

	/* Three rounds; the scratch states swap roles after every full MDS update */
	for ( round_number = 0; round_number < 3; round_number++ ) {

		/* Key injection: preliminary word n lands on state word 2n */
		for ( word_index = 0; word_index < 8; word_index++ ) {

			current_state[ 2 * word_index ] ^= preliminary_key[ word_index ];

		}

		do__pht_a_diffuse( current_state );
		do__quad_diffuse__q0( current_state );
		do__pht_b_diffuse( current_state );

		do__full_mds_state_update__no_key( current_state, next_state, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

		swap_state = current_state;
		current_state = next_state;
		next_state = swap_state;

	}

	/* Closing key injection on the output of the last round */
	for ( word_index = 0; word_index < 8; word_index++ ) {

		current_state[ 2 * word_index ] ^= preliminary_key[ word_index ];

	}

	do__xlate_state_mds_8x8s( (u8 *) current_state, principle_key, xlate_array__post, mds_8x8s );

	do__xor_key_with_state( principle_key, message_block );

	/* clear up - volatile stores so the wipe survives optimisation */
	for ( word_index = 0; word_index < SGAIL__NUM_64_BIT_WORDS; word_index++ ) {

		wipe_a[ word_index ] = 0;
		wipe_b[ word_index ] = 0;

	}

}


/*
 * Derive a combined principle key from a left/right pair of message
 * blocks using the single-round derivation.
 *
 * Two translation tables are seeded from sbox_0 and each is passed to
 * do__permutate_xlate_buffer together with one message block (presumably
 * permuting the table as a function of that block).  Each block is then
 * processed by do__process_principle_key__single__1_rounds with the tables
 * crossed over: the left block uses the right table as "pre" and the left
 * table as "post", the right block uses the left table as "pre" and the
 * right table as "post".  The result is the word-wise 64-bit sum of the
 * two half keys.
 *
 * NOTE(review): the tables are seeded from the file-scope sbox_0 rather
 * than from the sbox parameter, which is only forwarded to the
 * single-block routine - confirm this is intended.
 *
 * The local tables and half keys are zeroed before returning.  The stores
 * go through volatile-qualified pointers so that the compiler cannot drop
 * them as dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__pair__1_rounds( u64 message_block__left[ SGAIL__NUM_64_BIT_WORDS ], u64 message_block__right[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key__left[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 preliminary_key__right[ PRELIMINARY_KEY__64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 principle_key__left[ SGAIL__NUM_64_BIT_WORDS ], principle_key__right[ SGAIL__NUM_64_BIT_WORDS ];
	u8 xlate_array__left[ SGAIL__STATE__SIZE ], xlate_array__right[ SGAIL__STATE__SIZE ];
	volatile u64 *wipe_key__left = principle_key__left;
	volatile u64 *wipe_key__right = principle_key__right;
	volatile u8 *wipe_xlate__left = xlate_array__left;
	volatile u8 *wipe_xlate__right = xlate_array__right;
	u32 loop_counter;

	memcpy( xlate_array__left, sbox_0, SGAIL__STATE__SIZE );
	memcpy( xlate_array__right, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array__left, (u8 *) message_block__left, 0, sbox_0 );
	do__permutate_xlate_buffer( xlate_array__right, (u8 *) message_block__right, 0, sbox_0 );

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key pair 1 round, xlate left\n" );
	do__display_state_buffer_bytewise( xlate_array__left );
	printf("\nIn process principle key pair 1 round, xlate right\n" );
	do__display_state_buffer_bytewise( xlate_array__right );
#endif

	/* Tables are crossed: each side is pre-translated with the other side's table */
	do__process_principle_key__single__1_rounds( message_block__left, principle_key__left, preliminary_key__left, xlate_array__right, xlate_array__left, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );
	do__process_principle_key__single__1_rounds( message_block__right, principle_key__right, preliminary_key__right, xlate_array__left, xlate_array__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );

	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		principle_key[ loop_counter ] = principle_key__left[ loop_counter ] + principle_key__right[ loop_counter ];

	}
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn process principle key pair 1 round, final principle key\n" );
	do__display_state_buffer_64bit_words( principle_key );
#endif

	/* clean up - volatile stores so the wipe survives optimisation */
	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		wipe_xlate__left[ loop_counter ] = 0;
		wipe_xlate__right[ loop_counter ] = 0;

	}
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		wipe_key__left[ loop_counter ] = 0;
		wipe_key__right[ loop_counter ] = 0;

	}

}


/*
 * Derive a combined principle key from a left/right pair of message
 * blocks using the two-round derivation.
 *
 * Two translation tables are seeded from sbox_0 and each is passed to
 * do__permutate_xlate_buffer together with one message block (presumably
 * permuting the table as a function of that block).  Each block is then
 * processed by do__process_principle_key__single__2_rounds with the tables
 * crossed over: the left block uses the right table as "pre" and the left
 * table as "post", the right block uses the left table as "pre" and the
 * right table as "post".  The result is the word-wise 64-bit sum of the
 * two half keys.
 *
 * NOTE(review): the tables are seeded from the file-scope sbox_0 rather
 * than from the sbox parameter, which is only forwarded to the
 * single-block routine - confirm this is intended.
 *
 * The local tables and half keys are zeroed before returning.  The stores
 * go through volatile-qualified pointers so that the compiler cannot drop
 * them as dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__pair__2_rounds( u64 message_block__left[ SGAIL__NUM_64_BIT_WORDS ], u64 message_block__right[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key__left[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 preliminary_key__right[ PRELIMINARY_KEY__64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 principle_key__left[ SGAIL__NUM_64_BIT_WORDS ], principle_key__right[ SGAIL__NUM_64_BIT_WORDS ];
	u8 xlate_array__left[ SGAIL__STATE__SIZE ], xlate_array__right[ SGAIL__STATE__SIZE ];
	volatile u64 *wipe_key__left = principle_key__left;
	volatile u64 *wipe_key__right = principle_key__right;
	volatile u8 *wipe_xlate__left = xlate_array__left;
	volatile u8 *wipe_xlate__right = xlate_array__right;
	u32 loop_counter;

	memcpy( xlate_array__left, sbox_0, SGAIL__STATE__SIZE );
	memcpy( xlate_array__right, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array__left, (u8 *) message_block__left, 0, sbox_0 );
	do__permutate_xlate_buffer( xlate_array__right, (u8 *) message_block__right, 0, sbox_0 );

	/* Tables are crossed: each side is pre-translated with the other side's table */
	do__process_principle_key__single__2_rounds( message_block__left, principle_key__left, preliminary_key__left, xlate_array__right, xlate_array__left, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );
	do__process_principle_key__single__2_rounds( message_block__right, principle_key__right, preliminary_key__right, xlate_array__left, xlate_array__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );

	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		principle_key[ loop_counter ] = principle_key__left[ loop_counter ] + principle_key__right[ loop_counter ];

	}

	/* clean up - volatile stores so the wipe survives optimisation */
	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		wipe_xlate__left[ loop_counter ] = 0;
		wipe_xlate__right[ loop_counter ] = 0;

	}
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		wipe_key__left[ loop_counter ] = 0;
		wipe_key__right[ loop_counter ] = 0;

	}

}


/*
 * Derive a combined principle key from a left/right pair of message
 * blocks using the three-round derivation.
 *
 * Two translation tables are seeded from sbox_0 and each is passed to
 * do__permutate_xlate_buffer together with one message block (presumably
 * permuting the table as a function of that block).  Each block is then
 * processed by do__process_principle_key__single__3_rounds with the tables
 * crossed over: the left block uses the right table as "pre" and the left
 * table as "post", the right block uses the left table as "pre" and the
 * right table as "post".  The result is the word-wise 64-bit sum of the
 * two half keys.
 *
 * NOTE(review): the tables are seeded from the file-scope sbox_0 rather
 * than from the sbox parameter, which is only forwarded to the
 * single-block routine - confirm this is intended.
 *
 * The local tables and half keys are zeroed before returning.  The stores
 * go through volatile-qualified pointers so that the compiler cannot drop
 * them as dead writes to locals that are about to go out of scope.
 */
void do__process_principle_key__pair__3_rounds( u64 message_block__left[ SGAIL__NUM_64_BIT_WORDS ], u64 message_block__right[ SGAIL__NUM_64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key__left[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 preliminary_key__right[ PRELIMINARY_KEY__64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 principle_key__left[ SGAIL__NUM_64_BIT_WORDS ], principle_key__right[ SGAIL__NUM_64_BIT_WORDS ];
	u8 xlate_array__left[ SGAIL__STATE__SIZE ], xlate_array__right[ SGAIL__STATE__SIZE ];
	volatile u64 *wipe_key__left = principle_key__left;
	volatile u64 *wipe_key__right = principle_key__right;
	volatile u8 *wipe_xlate__left = xlate_array__left;
	volatile u8 *wipe_xlate__right = xlate_array__right;
	u32 loop_counter;

	memcpy( xlate_array__left, sbox_0, SGAIL__STATE__SIZE );
	memcpy( xlate_array__right, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array__left, (u8 *) message_block__left, 0, sbox_0 );
	do__permutate_xlate_buffer( xlate_array__right, (u8 *) message_block__right, 0, sbox_0 );

	/* Tables are crossed: each side is pre-translated with the other side's table */
	do__process_principle_key__single__3_rounds( message_block__left, principle_key__left, preliminary_key__left, xlate_array__right, xlate_array__left, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );
	do__process_principle_key__single__3_rounds( message_block__right, principle_key__right, preliminary_key__right, xlate_array__left, xlate_array__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );

	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		principle_key[ loop_counter ] = principle_key__left[ loop_counter ] + principle_key__right[ loop_counter ];

	}

	/* clean up - volatile stores so the wipe survives optimisation */
	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		wipe_xlate__left[ loop_counter ] = 0;
		wipe_xlate__right[ loop_counter ] = 0;

	}
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		wipe_key__left[ loop_counter ] = 0;
		wipe_key__right[ loop_counter ] = 0;

	}

}


/* Extract the round key from principle_key_extract array */
/*
 * Expand one quadrant of the principle key into a full 32-word round key.
 *
 * The selected quadrant supplies eight principle key words, starting at
 * word 0, 1, 16 or 17 for quadrant 0, 1, 2 or 3 and stepping by two.  Lane
 * n of the quadrant is XORed with preliminary_key[ n ] and with
 * rc_u64[ ( round_index + n ) & RC_ENTRIES__MASK ] and stored in round key
 * word 2n.  Any other quadrant value leaves the even words 0..14 as the
 * caller supplied them.
 *
 * The remaining 24 words are rotated copies of those eight even words:
 *   words 1, 3, .. 15   : lane n ^ 1, rotated by ( KE_ROT_1 + round_index ) & 0x3f
 *   words 16, 18, .. 30 : lane n ^ 3, rotated by ( KE_ROT_2 + round_index ) & 0x3f
 *   words 17, 19, .. 31 : lane n ^ 7, rotated by ( KE_ROT_3 + round_index ) & 0x3f
 *
 * NOTE(review): each rotation count can be zero for some round_index -
 * confirm that ROTL_W is well defined for a zero count.
 *
 * The mds_8x8s parameter is not referenced by this function.
 */
void do__key_extract_x4( u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u32 quadrant, u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 round_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], u32 round_index ) {

	/* First principle key word of each quadrant */
	static const u32 quadrant_first_word[ 4 ] = { 0, 1, 16, 17 };

	u32 rotate_near, rotate_mid, rotate_far;
	u32 first_word, lane;

	rotate_near = ( KE_ROT_1 + round_index ) & 0x3f;
	rotate_mid = ( KE_ROT_2 + round_index ) & 0x3f;
	rotate_far = ( KE_ROT_3 + round_index ) & 0x3f;

	/* Even words 0..14: quadrant word ^ preliminary key word ^ round constant */
	if ( quadrant < 4 ) {

		first_word = quadrant_first_word[ quadrant ];

		for ( lane = 0; lane < 8; lane++ ) {

			round_key[ 2 * lane ] = principle_key[ first_word + 2 * lane ] ^ preliminary_key[ lane ] ^ rc_u64[ ( round_index + lane ) & RC_ENTRIES__MASK ];

		}

	}

	/* Odd words 1..15: neighbouring lanes swapped */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 2 * lane + 1 ] = ROTL_W( round_key[ 2 * ( lane ^ 1 ) ], rotate_near, WORD_BITS_64, WORD_MODULUS_64 );

	}

	/* Even words 16..30: lanes reversed within each group of four */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 16 + 2 * lane ] = ROTL_W( round_key[ 2 * ( lane ^ 3 ) ], rotate_mid, WORD_BITS_64, WORD_MODULUS_64 );

	}

	/* Odd words 17..31: all eight lanes reversed */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 17 + 2 * lane ] = ROTL_W( round_key[ 2 * ( lane ^ 7 ) ], rotate_far, WORD_BITS_64, WORD_MODULUS_64 );

	}

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn key extract x4, round key\n" );
	do__display_state_buffer_64bit_words( round_key );
#endif

}


/*
 * Expand two quadrants of the principle key into a full 32-word round key.
 *
 * quadrant_a (0 or 2) supplies the even round key words 0..14: lane n is
 * principle key word 2n (quadrant 0) or 16 + 2n (quadrant 2), XORed with
 * preliminary_key[ n ] and rc_u64[ ( round_index + n ) & RC_ENTRIES__MASK ].
 *
 * quadrant_b (1 or 3) supplies the odd round key words 1..15: lane n is
 * principle key word 1 + 2n (quadrant 1) or 17 + 2n (quadrant 3), rotated
 * by ( KE_ROT_1 + round_index ) & 0x3f.  No preliminary key word or round
 * constant is mixed into these.
 *
 * Any other selector value leaves the corresponding words as the caller
 * supplied them.
 *
 * The upper half is derived from the lower half:
 *   words 16, 18, .. 30 : even lane n ^ 1, rotated by ( KE_ROT_2 + round_index ) & 0x3f
 *   words 17, 19, .. 31 : odd lane n ^ 3,  rotated by ( KE_ROT_3 + round_index ) & 0x3f
 *
 * NOTE(review): each rotation count can be zero for some round_index -
 * confirm that ROTL_W is well defined for a zero count.
 *
 * The mds_8x8s parameter is not referenced by this function.
 */
void do__key_extract_x2( u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u32 quadrant_a, u32 quadrant_b, u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 round_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], u32 round_index ) {

	u32 rotate_near, rotate_mid, rotate_far;
	u32 first_word, lane;

	rotate_near = ( KE_ROT_1 + round_index ) & 0x3f;
	rotate_mid = ( KE_ROT_2 + round_index ) & 0x3f;
	rotate_far = ( KE_ROT_3 + round_index ) & 0x3f;

	/* Even words 0..14 from quadrant_a: keyed with preliminary key and round constants */
	if ( quadrant_a == 0 || quadrant_a == 2 ) {

		first_word = ( quadrant_a == 0 ) ? 0 : 16;

		for ( lane = 0; lane < 8; lane++ ) {

			round_key[ 2 * lane ] = principle_key[ first_word + 2 * lane ] ^ preliminary_key[ lane ] ^ rc_u64[ ( round_index + lane ) & RC_ENTRIES__MASK ];

		}

	}

	/* Odd words 1..15 from quadrant_b: rotated only */
	if ( quadrant_b == 1 || quadrant_b == 3 ) {

		first_word = ( quadrant_b == 1 ) ? 1 : 17;

		for ( lane = 0; lane < 8; lane++ ) {

			round_key[ 2 * lane + 1 ] = ROTL_W( principle_key[ first_word + 2 * lane ], rotate_near, WORD_BITS_64, WORD_MODULUS_64 );

		}

	}

	/* Even words 16..30: neighbouring even lanes swapped */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 16 + 2 * lane ] = ROTL_W( round_key[ 2 * ( lane ^ 1 ) ], rotate_mid, WORD_BITS_64, WORD_MODULUS_64 );

	}

	/* Odd words 17..31: odd lanes reversed within each group of four */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 17 + 2 * lane ] = ROTL_W( round_key[ 2 * ( lane ^ 3 ) + 1 ], rotate_far, WORD_BITS_64, WORD_MODULUS_64 );

	}

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn key extract x2, round key\n" );
	do__display_state_buffer_64bit_words( round_key );
#endif

}


/*
 * Build the pre-whitening round key.
 *
 * Lower half (words 0..15): the principle key XORed with the preliminary
 * key.  Even word 2n takes preliminary_key[ n ]; odd word 2n + 1 takes
 * preliminary_key[ 7 - n ], i.e. the preliminary key in reverse order.
 *
 * Upper half (words 16..31): copied from the principle key unchanged.
 *
 * do__pht_a_diffuse is then applied to the assembled round key.
 *
 * The mds_8x8s parameter is not referenced by this function.
 */
void do__key_extract__pre_whitening( u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 round_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] ) {

	u32 lane, word;

	/* Even words 0..14: preliminary key in ascending order */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 2 * lane ] = principle_key[ 2 * lane ] ^ preliminary_key[ lane ];

	}

	/* Odd words 1..15: preliminary key in descending order */
	for ( lane = 0; lane < 8; lane++ ) {

		round_key[ 2 * lane + 1 ] = principle_key[ 2 * lane + 1 ] ^ preliminary_key[ 7 - lane ];

	}

	/* Upper half is a straight copy: even words first, then odd words */
	for ( word = 16; word < 32; word += 2 ) {

		round_key[ word ] = principle_key[ word ];

	}

	for ( word = 17; word < 32; word += 2 ) {

		round_key[ word ] = principle_key[ word ];

	}

	do__pht_a_diffuse( round_key );

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
	printf("\nIn prewhitening, round key\n" );
	do__display_state_buffer_64bit_words( round_key );
#endif

}



/*
 * Build the post-whitening round key.
 *
 *   principle_key   : source key words; words 0..31 are read
 *   preliminary_key : eight words (0..7) folded into the low half of the key
 *   round_key       : output; words 0..31 are written, then diffused in place
 *   mds_8x8s        : not referenced by this function
 *
 * The key is assembled exactly as for pre-whitening (even words XOR the
 * preliminary key ascending, odd words XOR it descending, words 16..31 are
 * copied), but the result is passed through do__pht_b_diffuse.
 */
void do__key_extract__post_whitening( u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 round_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] ) {

	u32 pair_index;
	u32 word_index;

	/* Even words 0, 2, .. 14 : preliminary key taken in ascending order */
	for ( pair_index = 0; pair_index < 8; pair_index++ ) {

		word_index = 2 * pair_index;
		round_key[ word_index ] = principle_key[ word_index ] ^ preliminary_key[ pair_index ];

	}

	/* Odd words 1, 3, .. 15 : preliminary key taken in descending order */
	for ( pair_index = 0; pair_index < 8; pair_index++ ) {

		word_index = 2 * pair_index + 1;
		round_key[ word_index ] = principle_key[ word_index ] ^ preliminary_key[ 7 - pair_index ];

	}

	/* Upper half, words 16 .. 31 : straight copy of the principle key */
	for ( word_index = 16; word_index < 32; word_index++ ) {

		round_key[ word_index ] = principle_key[ word_index ];

	}

	do__pht_b_diffuse( round_key );

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn key post whitening, round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

}



/*
 * XOR every 64-bit word of round_key into the matching word of state_array.
 * state_array is updated in place; round_key is only read.  Callers in this
 * file use it both for key whitening and for the chaining feed-forward.
 */
void do__xor_key_with_state( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 round_key[ SGAIL__NUM_64_BIT_WORDS ] ) {

	u32 word_index = 0;

	while ( word_index < SGAIL__NUM_64_BIT_WORDS ) {

		state_array[ word_index ] = state_array[ word_index ] ^ round_key[ word_index ];
		word_index++;

	}

}






/*
 * Hash Chaining Constructions
 */

/*
 * Load the standard IV into the chaining state: the first SBOX__SIZE bytes of
 * the sbox are copied byte-for-byte over the start of state_array.
 */
void do__init__chaining_state( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], const u8 sbox[ SBOX__SIZE ] ) {

	u8 *iv_destination = (u8 *)state_array;
	const u8 *iv_source = sbox;

	memcpy( iv_destination, iv_source, SBOX__SIZE );

}


/*
 * This is the compression function core (four centre rounds).
 *
 *   state_array           : chaining state; read as input and overwritten with the new chaining value
 *   preliminary_key       : preliminary key words, passed on to the key extraction routines
 *   principle_key         : principle key words; also used (as bytes) to permute the xlate buffer
 *   mds_8x8s              : table passed to the xlate mds, key extraction and full mds steps
 *   mds_16x8s_lhs / _rhs  : tables passed to do__full_mds_state_update
 *   block_count__low_word : only its low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save a copy of the incoming state, build the xlate buffer, pre-whiten,
 * xlate mds, four rounds of ( pht_a, quad diffuse qN, pht_b, x4 key extract N, full mds update )
 * alternating between state_array and centre_state_array, final xlate mds, post-whiten,
 * then XOR the saved copy of the incoming state back in.  All local working buffers are
 * zeroed before returning.
 */
void do__update__chaining_state__4_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];
	u32 loop_counter;


#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; initial state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Make a copy of the state array for Davies-Meyer */
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		chaining_state_array[ loop_counter ] = state_array[ loop_counter ];

	}

	/* Setup the xlate matrix : start from sbox_0, then permute using the principle key bytes and the low byte of the block count */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; xlate buffer\n" );
do__display_state_buffer_64bit_words( xlate_array );
#endif


	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; state array after xor with pre whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Do first xlate mds : result lands in centre_state_array */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; state array xlate mds\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 1 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round one after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q0( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round one after quad diffuse q0\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round one after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round one round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round one after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 2 : Use second x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round two after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q1( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round two after quad diffuse q1\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round two after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round two round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round two after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 3 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round three after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q2( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round three after quad diffuse q2\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round three after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round three round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round three after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 4 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round four after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q3( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round four after quad diffuse q3\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round four after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round four round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; round four after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	/* Do last xlate mds : result lands back in state_array */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; after final xlate mds\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; post whitening key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; after xor of post whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Apply chaining : feed the saved input state forward into the output */
	do__xor_key_with_state( state_array, chaining_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 4 rounds; after xor of chaining state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/*
	 * Clear up.
	 * xlate_array is a byte array of SGAIL__STATE__SIZE entries, so it needs its own
	 * loop bound; sharing the 64-bit word count only wiped its first few bytes and
	 * left the rest of the key-derived permutation behind.
	 * NOTE(review): these are plain stores to locals that are about to go out of
	 * scope, so an optimising compiler may be allowed to drop them - confirm whether
	 * a guaranteed wipe is required.
	 */
	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		xlate_array[ loop_counter ] = 0;

	}

	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		round_key[ loop_counter ] = 0;
		centre_state_array[ loop_counter ] = 0;
		chaining_state_array[ loop_counter ] = 0;

	}

}


/*
 * Compression function core (six centre rounds).
 *
 *   state_array           : chaining state; read as input and overwritten with the new chaining value
 *   preliminary_key       : preliminary key words, passed on to the key extraction routines
 *   principle_key         : principle key words; also used (as bytes) to permute the xlate buffer
 *   mds_8x8s              : table passed to the xlate mds, key extraction and full mds steps
 *   mds_16x8s_lhs / _rhs  : tables passed to do__full_mds_state_update
 *   block_count__low_word : only its low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save a copy of the incoming state, build the xlate buffer, pre-whiten,
 * xlate mds, six rounds of ( pht_a, quad diffuse, pht_b, key extract, full mds update )
 * alternating between state_array and centre_state_array, final xlate mds, post-whiten,
 * then XOR the saved copy of the incoming state back in.  Rounds 1 and 6 use x2 keys,
 * rounds 2 to 5 use the four x4 keys.  All local working buffers are zeroed before
 * returning.
 */
void do__update__chaining_state__6_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];
	u32 loop_counter;

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; initial state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Make a copy of the state array for Davies-Meyer */
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		chaining_state_array[ loop_counter ] = state_array[ loop_counter ];

	}

	/* Setup the xlate matrix : start from sbox_0, then permute using the principle key bytes and the low byte of the block count */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; xlate buffer\n" );
do__display_state_buffer_64bit_words( xlate_array );
#endif

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; state array after xor with pre whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Do first xlate mds : result lands in centre_state_array */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; state array xlate mds\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 1 : Use first x2 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round one after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q0( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round one after quad diffuse q0\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round one after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x2( principle_key, 0, 1, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round one round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round one after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 2 : Use first x4 key */
	/* NOTE(review): rounds 1 and 2 both use quad diffuse q0 - confirm against the specification */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round two after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q0( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round two after quad diffuse q0\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round two after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round two round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round two after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 3 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round three after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q1( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round three after quad diffuse q1\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round three after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round three round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round three after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 4 : Use third x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round four after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q2( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round four after quad diffuse q2\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round four after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round four round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round four after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 5 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round five after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q3( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round five after quad diffuse q3\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round five after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round five round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round five after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 6 : Use second x2 key */
	/* NOTE(review): rounds 5 and 6 both use quad diffuse q3, and the last argument to
	   do__key_extract_x2 is 5 here but 0 in round 1 - confirm both against the specification */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round six after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q3( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round six after quad diffuse q3\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round six after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x2( principle_key, 2, 3, preliminary_key, round_key, mds_8x8s, 5 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round six round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; round six after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Do last xlate mds : result lands back in state_array */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; after final xlate mds\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; post whitening key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; after xor of post whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Apply chaining : feed the saved input state forward into the output */
	do__xor_key_with_state( state_array, chaining_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 6 rounds; after xor of chaining state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/*
	 * Clear up.
	 * xlate_array is a byte array of SGAIL__STATE__SIZE entries, so it needs its own
	 * loop bound; sharing the 64-bit word count only wiped its first few bytes and
	 * left the rest of the key-derived permutation behind.
	 * NOTE(review): these are plain stores to locals that are about to go out of
	 * scope, so an optimising compiler may be allowed to drop them - confirm whether
	 * a guaranteed wipe is required.
	 */
	for ( loop_counter = 0; loop_counter < SGAIL__STATE__SIZE; loop_counter++ ) {

		xlate_array[ loop_counter ] = 0;

	}

	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		round_key[ loop_counter ] = 0;
		centre_state_array[ loop_counter ] = 0;
		chaining_state_array[ loop_counter ] = 0;

	}

}


void do__update__chaining_state__8_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];
	u32 loop_counter;

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; initial state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Make a copy of the state array for Davies-Mayer */
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		chaining_state_array[ loop_counter ] = state_array[ loop_counter ];

	}

	/* Setup the xlate matrix */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; xlate buffer\n" );
do__display_state_buffer_64bit_words( xlate_array );
#endif

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; state array after xor with pre whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Do first xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; state array xlate mds\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 1 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round one after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q0( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round one after quad diffuse q0\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round one after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round one round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round one after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 2 : Use second x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round two after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q1( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round two after quad diffuse q1\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round two after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round two round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round two after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 3 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round three after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q2( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round three after quad diffuse q2\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round three after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round three round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round three after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 4 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round four after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q3( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round four after quad diffuse q3\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round four after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round four round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round four after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 5 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round five after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q0( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round five after quad diffuse q0\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round five after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round five round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round five after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 6 : Use second x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round six after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q1( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round six after quad diffuse q1\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round six after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round six round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round six after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Round 7 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round seven after pht_a\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__quad_diffuse__q2( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round seven after quad diffuse q2\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__pht_b_diffuse( centre_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round seven after pht_b\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif

	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round seven round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round seven after full mds update\n" );
do__display_state_buffer_64bit_words( state_array );
#endif


	/* Round 8 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round eight after pht_a\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__quad_diffuse__q3( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round eight after quad diffuse q3\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__pht_b_diffuse( state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round eight after pht_b\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 1 );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round eight round key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; round eight after full mds update\n" );
do__display_state_buffer_64bit_words( centre_state_array );
#endif


	/* Do last xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; after final xlate mds\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; post whitening key\n" );
do__display_state_buffer_64bit_words( round_key );
#endif

	do__xor_key_with_state( state_array, round_key );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; after xor of post whitening key\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Apply chaining */
	do__xor_key_with_state( state_array, chaining_state_array );
#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("\nIn chaining state 8 rounds; after xor of chaining state\n" );
do__display_state_buffer_64bit_words( state_array );
#endif

	/* Clear up */
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		xlate_array[ loop_counter ] = 0;
		round_key[ loop_counter ] = 0;
		centre_state_array[ loop_counter ] = 0;
		chaining_state_array[ loop_counter ] = 0;

	}


}


/*
 * Chaining-state update using 10 centre rounds.
 *
 * state_array           : chaining state, updated in place
 * preliminary_key       : passed through to the key extraction routines
 * principle_key         : passed to the key extraction routines and, viewed as bytes,
 *                         to do__permutate_xlate_buffer
 * mds_8x8s,
 * mds_16x8s_lhs,
 * mds_16x8s_rhs         : MDS lookup tables handed on to the helper routines
 * block_count__low_word : only the low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save input state -> build xlate buffer -> pre-whiten -> xlate mds ->
 * 10 rounds (alternating between state_array and centre_state_array) -> xlate mds ->
 * post-whiten -> xor the saved input state back in (feed-forward).
 *
 * Rounds 1 and 10 use x2 keys, rounds 2..9 use x4 keys.
 */
void do__update__chaining_state__10_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];

	/* Make a copy of the state array for Davies-Meyer */
	memcpy( chaining_state_array, state_array, sizeof( chaining_state_array ) );

	/* Setup the xlate matrix */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );

	/* Do first xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );

	/* Round 1 : Use first x2 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x2( principle_key, 0, 1, preliminary_key, round_key, mds_8x8s, 0 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 2 : Use first x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q0( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 3 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q1( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 4 : Use third x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q2( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 5 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q3( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 6 : Use first x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q0( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 7 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q1( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 8 : Use third x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q2( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 9 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q3( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 10 : Use second x2 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x2( principle_key, 2, 3, preliminary_key, round_key, mds_8x8s, 5 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Do last xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
	do__xor_key_with_state( state_array, round_key );

	/* Apply chaining */
	do__xor_key_with_state( state_array, chaining_state_array );

	/*
	 * Clear up : wipe every working buffer over its full length.  xlate_array is a
	 * byte array of SGAIL__STATE__SIZE entries, so it cannot share a word-count loop
	 * bound with the u64 arrays.
	 * NOTE(review): an optimiser may drop stores to locals that are about to go out of
	 * scope; consider a guaranteed-wipe primitive if this clean-up must survive -O2.
	 */
	memset( xlate_array, 0, sizeof( xlate_array ) );
	memset( round_key, 0, sizeof( round_key ) );
	memset( centre_state_array, 0, sizeof( centre_state_array ) );
	memset( chaining_state_array, 0, sizeof( chaining_state_array ) );

}


/*
 * Chaining-state update using 12 centre rounds.
 *
 * state_array           : chaining state, updated in place
 * preliminary_key       : passed through to the key extraction routines
 * principle_key         : passed to the key extraction routines and, viewed as bytes,
 *                         to do__permutate_xlate_buffer
 * mds_8x8s,
 * mds_16x8s_lhs,
 * mds_16x8s_rhs         : MDS lookup tables handed on to the helper routines
 * block_count__low_word : only the low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save input state -> build xlate buffer -> pre-whiten -> xlate mds ->
 * 12 rounds (alternating between state_array and centre_state_array) -> xlate mds ->
 * post-whiten -> xor the saved input state back in (feed-forward).
 *
 * Every round uses an x4 key; the four x4 keys and q0..q3 diffusers are cycled three times.
 */
void do__update__chaining_state__12_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];

	/* Make a copy of the state array for Davies-Meyer */
	memcpy( chaining_state_array, state_array, sizeof( chaining_state_array ) );

	/* Setup the xlate matrix */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );

	/* Do first xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );

	/* Round 1 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 2 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 3 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 4 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 5 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 6 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 7 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 8 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 9 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 10 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 11 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 12 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Do last xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
	do__xor_key_with_state( state_array, round_key );

	/* Apply chaining */
	do__xor_key_with_state( state_array, chaining_state_array );

	/*
	 * Clear up : wipe every working buffer over its full length.  xlate_array is a
	 * byte array of SGAIL__STATE__SIZE entries, so it cannot share a word-count loop
	 * bound with the u64 arrays.
	 * NOTE(review): an optimiser may drop stores to locals that are about to go out of
	 * scope; consider a guaranteed-wipe primitive if this clean-up must survive -O2.
	 */
	memset( xlate_array, 0, sizeof( xlate_array ) );
	memset( round_key, 0, sizeof( round_key ) );
	memset( centre_state_array, 0, sizeof( centre_state_array ) );
	memset( chaining_state_array, 0, sizeof( chaining_state_array ) );

}


/*
 * Chaining-state update using 14 centre rounds.
 *
 * state_array           : chaining state, updated in place
 * preliminary_key       : passed through to the key extraction routines
 * principle_key         : passed to the key extraction routines and, viewed as bytes,
 *                         to do__permutate_xlate_buffer
 * mds_8x8s,
 * mds_16x8s_lhs,
 * mds_16x8s_rhs         : MDS lookup tables handed on to the helper routines
 * block_count__low_word : only the low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save input state -> build xlate buffer -> pre-whiten -> xlate mds ->
 * 14 rounds (alternating between state_array and centre_state_array) -> xlate mds ->
 * post-whiten -> xor the saved input state back in (feed-forward).
 *
 * Rounds 1 and 14 use x2 keys, rounds 2..13 use x4 keys.
 */
void do__update__chaining_state__14_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];

	/* Make a copy of the state array for Davies-Meyer */
	memcpy( chaining_state_array, state_array, sizeof( chaining_state_array ) );

	/* Setup the xlate matrix */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );

	/* Do first xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );

	/* Round 1 : Use first x2 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x2( principle_key, 0, 1, preliminary_key, round_key, mds_8x8s, 0 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 2 : Use first x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q0( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 3 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q1( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 4 : Use third x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q2( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 5 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q3( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 6 : Use first x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q0( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 7 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q1( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 8 : Use third x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q2( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 9 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q3( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 10 : Use first x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q0( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 11 : Use second x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q1( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 12 : Use third x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q2( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 13 : Use fourth x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q3( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 14 : Use second x2 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x2( principle_key, 2, 3, preliminary_key, round_key, mds_8x8s, 5 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Do last xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
	do__xor_key_with_state( state_array, round_key );

	/* Apply chaining */
	do__xor_key_with_state( state_array, chaining_state_array );

	/*
	 * Clear up : wipe every working buffer over its full length.  xlate_array is a
	 * byte array of SGAIL__STATE__SIZE entries, so it cannot share a word-count loop
	 * bound with the u64 arrays.
	 * NOTE(review): an optimiser may drop stores to locals that are about to go out of
	 * scope; consider a guaranteed-wipe primitive if this clean-up must survive -O2.
	 */
	memset( xlate_array, 0, sizeof( xlate_array ) );
	memset( round_key, 0, sizeof( round_key ) );
	memset( centre_state_array, 0, sizeof( centre_state_array ) );
	memset( chaining_state_array, 0, sizeof( chaining_state_array ) );

}


/*
 * Chaining-state update using 16 centre rounds.
 *
 * state_array           : chaining state, updated in place
 * preliminary_key       : passed through to the key extraction routines
 * principle_key         : passed to the key extraction routines and, viewed as bytes,
 *                         to do__permutate_xlate_buffer
 * mds_8x8s,
 * mds_16x8s_lhs,
 * mds_16x8s_rhs         : MDS lookup tables handed on to the helper routines
 * block_count__low_word : only the low 8 bits are used, when permuting the xlate buffer
 *
 * Sequence : save input state -> build xlate buffer -> pre-whiten -> xlate mds ->
 * 16 rounds (alternating between state_array and centre_state_array) -> xlate mds ->
 * post-whiten -> xor the saved input state back in (feed-forward).
 *
 * Every round uses an x4 key; the four x4 keys and q0..q3 diffusers are cycled four times.
 */
void do__update__chaining_state__16_rounds( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ], u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ], const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 block_count__low_word ) {

	u8 xlate_array[ SGAIL__STATE__SIZE ];
	u64 chaining_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 centre_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];

	/* Make a copy of the state array for Davies-Meyer */
	memcpy( chaining_state_array, state_array, sizeof( chaining_state_array ) );

	/* Setup the xlate matrix */
	memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
	do__permutate_xlate_buffer( xlate_array, (u8 * )principle_key, block_count__low_word & 0xff, sbox_0 );

	/* Pre-whiten */
	do__key_extract__pre_whitening( principle_key, preliminary_key, round_key, mds_8x8s );
	do__xor_key_with_state( state_array, round_key );

	/* Do first xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)state_array, centre_state_array, xlate_array, mds_8x8s );

	/* Round 1 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 2 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 3 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 4 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 5 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 6 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 7 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 8 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 9 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 10 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 11 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 12 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 13 : Use first x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q0( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 0, preliminary_key, round_key, mds_8x8s, 1 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 14 : Use second x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q1( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 1, preliminary_key, round_key, mds_8x8s, 2 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 15 : Use third x4 key */
	do__pht_a_diffuse( centre_state_array );
	do__quad_diffuse__q2( centre_state_array );
	do__pht_b_diffuse( centre_state_array );
	do__key_extract_x4( principle_key, 2, preliminary_key, round_key, mds_8x8s, 3 );
	do__full_mds_state_update( centre_state_array, state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Round 16 : Use fourth x4 key */
	do__pht_a_diffuse( state_array );
	do__quad_diffuse__q3( state_array );
	do__pht_b_diffuse( state_array );
	do__key_extract_x4( principle_key, 3, preliminary_key, round_key, mds_8x8s, 4 );
	do__full_mds_state_update( state_array, centre_state_array, round_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs );

	/* Do last xlate mds */
	do__xlate_state_mds_8x8s( (u8 *)centre_state_array, state_array, xlate_array, mds_8x8s );

	/* Post whiten */
	do__key_extract__post_whitening( principle_key, preliminary_key, round_key, mds_8x8s );	
	do__xor_key_with_state( state_array, round_key );

	/* Apply chaining */
	do__xor_key_with_state( state_array, chaining_state_array );

	/*
	 * Clear up : wipe every working buffer over its full length.  xlate_array is a
	 * byte array of SGAIL__STATE__SIZE entries, so it cannot share a word-count loop
	 * bound with the u64 arrays.
	 * NOTE(review): an optimiser may drop stores to locals that are about to go out of
	 * scope; consider a guaranteed-wipe primitive if this clean-up must survive -O2.
	 */
	memset( xlate_array, 0, sizeof( xlate_array ) );
	memset( round_key, 0, sizeof( round_key ) );
	memset( centre_state_array, 0, sizeof( centre_state_array ) );
	memset( chaining_state_array, 0, sizeof( chaining_state_array ) );

}


/*
 * Finalise the chaining construction and place the digest into the buffer.
 *
 * state_array : chaining state; folded in place (for digests <= 512 bits) and then
 *               zeroed in full before returning
 * hashbitlen  : digest length in bits; ( hashbitlen >> 3 ) bytes are copied out
 * hashval     : destination buffer, must hold at least ( hashbitlen >> 3 ) bytes
 */
void do__finalise__chaining_state( u64 state_array[ SGAIL__NUM_64_BIT_WORDS ], int hashbitlen, BitSequence *hashval ) {

	u32 loop_counter;

	/*
	 * If hashbitlen is <= 512 bits, xor all four quadrants together before truncating :
	 * each even word w in 0..14 absorbs words w+1, w+16 and w+17.  Only odd words and
	 * words >= 16 are read, and those are never written, so the order of the folds
	 * does not matter.
	 * NOTE(review): the copy below takes a contiguous prefix of state_array, which
	 * includes the unfolded odd words - confirm this matches the specification.
	 */
	if ( hashbitlen <= 512 ) {

		for ( loop_counter = 0; loop_counter < 16; loop_counter += 2 ) {

			state_array[ loop_counter ] ^= state_array[ loop_counter + 1 ] ^ state_array[ loop_counter + 16 ] ^ state_array[ loop_counter + 17 ];

		}

	}

	/* Truncate : hashbitlen div 8 to get no. of bytes */
	memcpy( hashval, state_array, ( hashbitlen >> 3 ) );

	/* Now write over the whole state array with zeros (every word, not just word 0) */
	for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

		state_array[ loop_counter ] = 0;

	}

}






/*
 * High Level Stuff
 */

/*
 * Shadow function of the NIST Init - this does the actual work, but allows more
 * flexible parameters.
 *
 * state                : hash context to initialise
 * hashbitlen           : requested digest size; must equal one of the DIGEST__*_BITS values
 * centre_rounds        : must equal one of the CENTRE_ROUNDS__*_ROUNDS values
 * principle_key_rounds : must equal one of the PRINCIPLE_KEY_ROUNDS__*_ROUNDS values
 * secret_key           : words 0..3 are copied into the context
 * serial_number        : copied into the context
 * sbox                 : handed to do__init__chaining_state to set up the IV
 *
 * Returns SUCCESS, or FAIL if a pointer is NULL or any of the three size/round
 * parameters is not a recognised value.
 *
 * Note : an unrecognised hashbitlen (or a NULL pointer) leaves the context untouched.
 * An unrecognised round count is only detected after the context has been zeroed and
 * the IV, secret key and serial number have been stored.
 */
HashReturn do__init__hash_state( hashState *state, int hashbitlen, u32 centre_rounds, u32 principle_key_rounds, u64 secret_key[ SECRET_KEY__64_BIT_WORDS ], u64 serial_number, const u8 sbox[ SBOX__SIZE ] ) {

	int hashbitlen_is_valid;
	int centre_rounds_is_valid;
	int principle_key_rounds_is_valid;

	/* Refuse NULL pointers instead of dereferencing them below */
	if ( state == NULL || secret_key == NULL || sbox == NULL ) {

		return( FAIL );

	}

	hashbitlen_is_valid = ( hashbitlen == DIGEST__224_BITS || hashbitlen == DIGEST__256_BITS || hashbitlen == DIGEST__384_BITS || hashbitlen == DIGEST__512_BITS || hashbitlen == DIGEST__768_BITS || hashbitlen == DIGEST__1024_BITS || hashbitlen == DIGEST__1536_BITS || hashbitlen == DIGEST__2048_BITS );

	centre_rounds_is_valid = ( centre_rounds == CENTRE_ROUNDS__4_ROUNDS || centre_rounds == CENTRE_ROUNDS__6_ROUNDS || centre_rounds == CENTRE_ROUNDS__8_ROUNDS || centre_rounds == CENTRE_ROUNDS__10_ROUNDS || centre_rounds == CENTRE_ROUNDS__12_ROUNDS || centre_rounds == CENTRE_ROUNDS__14_ROUNDS || centre_rounds == CENTRE_ROUNDS__16_ROUNDS );

	principle_key_rounds_is_valid = ( principle_key_rounds == PRINCIPLE_KEY_ROUNDS__1_ROUNDS || principle_key_rounds == PRINCIPLE_KEY_ROUNDS__2_ROUNDS || principle_key_rounds == PRINCIPLE_KEY_ROUNDS__3_ROUNDS );

	/* check hashbitlen first */
	if ( !hashbitlen_is_valid ) {

		return( FAIL );

	}

	/* Zero over the context */
	memset( state, 0, sizeof( hashState ) );

	/* Setup the state_array with the IV (which is  just a copy of the sbox) */
	do__init__chaining_state( state->state_array, sbox );

	/* Copy in the secret key */
	state->secret_key[ 0 ] = secret_key[ 0 ];
	state->secret_key[ 1 ] = secret_key[ 1 ];
	state->secret_key[ 2 ] = secret_key[ 2 ];
	state->secret_key[ 3 ] = secret_key[ 3 ];

	/* Copy in the serial_number */
	state->serial_number = serial_number;

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
do__display_secret_key( secret_key );
/* Width goes before the length modifier; the cast makes the argument match %llx */
printf("Serial Number %016llx\n\n: ", (unsigned long long)serial_number );
#endif

	/* Store the number of rounds to use in the compression function */
	if ( !centre_rounds_is_valid ) {

		return( FAIL );

	}

	state->centre_rounds = centre_rounds;

	/* Store the number of rounds to use in the message processing function */
	if ( !principle_key_rounds_is_valid ) {

		return( FAIL );

	}

	state->principle_key_rounds = principle_key_rounds;

	/* Setup the remaining stuff */
	state->hashbitlen = hashbitlen;

	return( SUCCESS );	

}


/*
 * Shadow function of the NIST Update: feeds databitlen bits of data into the context.
 *
 * Whole bytes are buffered in state->partial_input_block; every time the buffer holds
 * SGAIL__INPUT_BLOCK__SIZE bytes the block is processed (preliminary keys, principle key,
 * chaining state update), the block counter is advanced by two and the buffer is cleared.
 * If databitlen is not a multiple of 8 the leading bits of the trailing byte are masked
 * and appended to the buffer; nothing is finalised here.
 *
 * The byte counters use DataLength so that inputs longer than a u32 can count are
 * consumed in full rather than truncated.
 *
 * NOTE(review): the sbox parameter is not used - principle key processing is always
 * called with the global sbox_0. Confirm whether that is intended.
 *
 * Always returns SUCCESS; an unrecognised round count stored in the context prints
 * "invalid rounds" and terminates the process.
 */
HashReturn do__update__hash_state( hashState *state, const BitSequence *data, DataLength databitlen, const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	DataLength data__bytes_length, data__bytes_processed, data__bytes_remaining;
	u32 bytes_to_copy;
	u32 processing_block__bytes_free;
	u32 loop_counter;
	u32 trailing_bits;
	u8  finalise_byte;
	u64 preliminary_key__left[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 preliminary_key__right[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 preliminary_key__combined[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ];


	/* Number of whole bytes; any trailing bits are handled after the main loop */
	data__bytes_length = databitlen >> 3;

	/* init */
	data__bytes_processed = 0;

	/* Main loop: copy bytes from data until we reach the block boundary, then process. */
	while ( data__bytes_processed < data__bytes_length ) {

		/* Calc how many bytes still need processing */
		data__bytes_remaining = data__bytes_length - data__bytes_processed;

		/* Calc how many bytes are free in the processing block */
		processing_block__bytes_free = SGAIL__INPUT_BLOCK__SIZE - state->partial_input_block__byte_length;

		/* Work out how many bytes we can actually copy into the remaining processing block */
		if ( data__bytes_remaining < processing_block__bytes_free ) {

			/* Smaller than the free space, so it always fits in a u32 */
			bytes_to_copy = (u32)data__bytes_remaining;

		} else {

			bytes_to_copy = processing_block__bytes_free;

		}


		/* Now do the memcpy and update the counters */
		memcpy( state->partial_input_block + state->partial_input_block__byte_length, data + data__bytes_processed, bytes_to_copy );
		data__bytes_processed += bytes_to_copy;
		state->partial_input_block__byte_length += bytes_to_copy;

		/* Now we test whether we have a full block - if so process the block */
		if ( state->partial_input_block__byte_length == SGAIL__INPUT_BLOCK__SIZE ) {

			do__process_preliminary_key( preliminary_key__left, state->secret_key, mds_8x8s, state->serial_number, state->block_count__high_word, state->block_count__low_word, 0LLU );
			do__process_preliminary_key( preliminary_key__right, state->secret_key, mds_8x8s, state->serial_number, state->block_count__high_word, state->block_count__low_word + 1, 0LLU );

			for ( loop_counter = 0; loop_counter < PRELIMINARY_KEY__64_BIT_WORDS; loop_counter++ ) {

				preliminary_key__combined[ loop_counter ] = preliminary_key__left[ loop_counter ] ^ preliminary_key__right[ loop_counter ];

			}

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("------------------------------------------------------------\n");
printf("\nCurrent full input block %u:%u\n", state->block_count__high_word, state->block_count__low_word );
do__display_input_block_64bit_words( state->partial_input_block );
printf("\nPreliminary key (left) to be used for this block\n");
do__display_preliminary_key( preliminary_key__left );
printf("\nPreliminary key (right) to be used for this block\n");
do__display_preliminary_key( preliminary_key__right );
printf("\nPreliminary key (combined) to be used for this block\n");
do__display_preliminary_key( preliminary_key__combined );
#endif

			/* Derive the principle key from both halves of the input block */
			switch( state->principle_key_rounds ) {
				case PRINCIPLE_KEY_ROUNDS__1_ROUNDS:
					do__process_principle_key__pair__1_rounds( (u64 *)&state->partial_input_block[ 0 ], (u64 *)&state->partial_input_block[ SGAIL__STATE__SIZE ], principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
					break;
				case PRINCIPLE_KEY_ROUNDS__2_ROUNDS:
					do__process_principle_key__pair__2_rounds( (u64 *)&state->partial_input_block[ 0 ], (u64 *)&state->partial_input_block[ SGAIL__STATE__SIZE ], principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
					break;
				case PRINCIPLE_KEY_ROUNDS__3_ROUNDS:
					do__process_principle_key__pair__3_rounds( (u64 *)&state->partial_input_block[ 0 ], (u64 *)&state->partial_input_block[ SGAIL__STATE__SIZE ], principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
					break;
				default:
					printf("invalid rounds\n");
					exit(1);
			}

			/* Update the chaining state */
			switch( state->centre_rounds ) {
				case CENTRE_ROUNDS__4_ROUNDS:
					do__update__chaining_state__4_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__6_ROUNDS:
					do__update__chaining_state__6_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__8_ROUNDS:
					do__update__chaining_state__8_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__10_ROUNDS:
					do__update__chaining_state__10_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__12_ROUNDS:
					do__update__chaining_state__12_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__14_ROUNDS:
					do__update__chaining_state__14_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				case CENTRE_ROUNDS__16_ROUNDS:
					do__update__chaining_state__16_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
					break;
				default:
					printf("invalid rounds\n");
					exit(1);
			}

			/* Increment the block counters (the low word advances by two per input block) */
			if ( state->block_count__low_word == WORD_MODULUS_64 || state->block_count__low_word == ( WORD_MODULUS_64 - 1 ) ) {

				state->block_count__high_word += 1;

			}
			state->block_count__low_word += 2;

			/* We're done, so can now write zeros over the key material and the partial block buffer, and reset the counter */
			for ( loop_counter = 0; loop_counter < SGAIL__NUM_64_BIT_WORDS; loop_counter++ ) {

				principle_key[ loop_counter ] = 0;

			}

			for ( loop_counter = 0; loop_counter < PRELIMINARY_KEY__64_BIT_WORDS; loop_counter++ ) {

				preliminary_key__left[ loop_counter ] = 0;
				preliminary_key__right[ loop_counter ] = 0;
				preliminary_key__combined[ loop_counter ] = 0;

			}
			memset( state->partial_input_block, 0, SGAIL__INPUT_BLOCK__SIZE );
			state->partial_input_block__byte_length = 0;

		}

	}

	state->partial_input_block__bit_length = state->partial_input_block__byte_length << 3;

	/* Need to now check whether we've got a boundary not byte aligned (only for finalise operation) */
	/* Should always have at least one byte free in the partial buffer here (if there wasn't it would have been processed) */
	/* We don't call any finalise stuff, that is done explicitly - just mask and append the bits */
	trailing_bits = (u32)( databitlen & 0x07 );
	if ( trailing_bits != 0 ) {

		/* Keep only the top trailing_bits bits of the last byte: 1 -> 0x80, 2 -> 0xc0, ... 7 -> 0xfe */
		finalise_byte = (u8)( ( 0xff00 >> trailing_bits ) & 0xff ) & *(data + data__bytes_processed);

		/* Append the masked byte */
		state->partial_input_block[ state->partial_input_block__byte_length ] = finalise_byte;

		/* Finally update our length counters (the low three bits of the bit length are zero before this) */
		state->partial_input_block__bit_length ^= trailing_bits;
		state->partial_input_block__byte_length += 1;

	}

	return( SUCCESS );

}


/*
 * Shadow function of the NIST Final: processes whatever is left in the partial input
 * block, writes the digest to hashval and wipes the context.
 *
 * The preliminary keys are derived with the buffered bit length as their last argument.
 * A buffer of at most SGAIL__STATE__SIZE bytes goes through the "single" principle key
 * routines together with a translation buffer built from sbox_0; a longer buffer goes
 * through the "pair" routines exactly as a full block would.
 *
 * NOTE(review): the sbox parameter is not used - the global sbox_0 is used throughout.
 * Confirm whether that is intended.
 *
 * Always returns SUCCESS; an unrecognised round count stored in the context prints
 * "invalid rounds" and terminates the process.
 */
HashReturn do__finalise__hash_state( hashState *state, BitSequence *hashval, const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	u64 principle_key[ SGAIL__NUM_64_BIT_WORDS ];
	u64 preliminary_key__left[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 preliminary_key__right[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 preliminary_key__combined[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u8  xlate_array[ SGAIL__STATE__SIZE ];
	u32 word_index;
	u64 *block_first_half;
	u64 *block_second_half;

	/* Left key uses the current block count, right key the next one */
	do__process_preliminary_key( preliminary_key__left, state->secret_key, mds_8x8s, state->serial_number, state->block_count__high_word, state->block_count__low_word, state->partial_input_block__bit_length );
	do__process_preliminary_key( preliminary_key__right, state->secret_key, mds_8x8s, state->serial_number, state->block_count__high_word, state->block_count__low_word + 1, state->partial_input_block__bit_length );

	/* Combined key is the word-wise XOR of the two */
	word_index = 0;
	while ( word_index < PRELIMINARY_KEY__64_BIT_WORDS ) {

		preliminary_key__combined[ word_index ] = preliminary_key__left[ word_index ] ^ preliminary_key__right[ word_index ];
		word_index += 1;

	}

#ifdef DEGUG__DISPLAY_INTERMEDIATE_VALUES
printf("------------------------------------------------------------\n");
printf("Final bit length %u, block count low word %u\n", state->partial_input_block__bit_length, state->block_count__low_word );
do__display_input_block_64bit_words( state->partial_input_block );
printf("\nPreliminary key (left) to be used for this block\n");
do__display_preliminary_key( preliminary_key__left );
printf("\nPreliminary key (right) to be used for this block\n");
do__display_preliminary_key( preliminary_key__right );
printf("\nPreliminary key (combined) to be used for this block\n");
do__display_preliminary_key( preliminary_key__combined );
#endif

	block_first_half  = (u64 *)&state->partial_input_block[ 0 ];
	block_second_half = (u64 *)&state->partial_input_block[ SGAIL__STATE__SIZE ];

	if ( state->partial_input_block__byte_length > SGAIL__STATE__SIZE ) {

		/* More than half a block buffered: treat it like a full block */
		switch( state->principle_key_rounds ) {
			case PRINCIPLE_KEY_ROUNDS__1_ROUNDS:
				do__process_principle_key__pair__1_rounds( block_first_half, block_second_half, principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			case PRINCIPLE_KEY_ROUNDS__2_ROUNDS:
				do__process_principle_key__pair__2_rounds( block_first_half, block_second_half, principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			case PRINCIPLE_KEY_ROUNDS__3_ROUNDS:
				do__process_principle_key__pair__3_rounds( block_first_half, block_second_half, principle_key, preliminary_key__left, preliminary_key__right, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			default:
				printf("invalid rounds\n");
				exit(1);
		}

	} else {

		/* At most half a block buffered: seed the translation buffer from sbox_0, permute it with the buffered data, then use the single-block routines */
		memcpy( xlate_array, sbox_0, SGAIL__STATE__SIZE );
		do__permutate_xlate_buffer( xlate_array, state->partial_input_block, 0, sbox_0 );

		switch( state->principle_key_rounds ) {
			case PRINCIPLE_KEY_ROUNDS__1_ROUNDS:
				do__process_principle_key__single__1_rounds( block_first_half, principle_key, preliminary_key__left, xlate_array, xlate_array, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			case PRINCIPLE_KEY_ROUNDS__2_ROUNDS:
				do__process_principle_key__single__2_rounds( block_first_half, principle_key, preliminary_key__left, xlate_array, xlate_array, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			case PRINCIPLE_KEY_ROUNDS__3_ROUNDS:
				do__process_principle_key__single__3_rounds( block_first_half, principle_key, preliminary_key__left, xlate_array, xlate_array, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox_0 );
				break;
			default:
				printf("invalid rounds\n");
				exit(1);
		}

	}

	/* Fold the final principle key into the chaining state */
	switch( state->centre_rounds ) {
		case CENTRE_ROUNDS__4_ROUNDS:
			do__update__chaining_state__4_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__6_ROUNDS:
			do__update__chaining_state__6_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__8_ROUNDS:
			do__update__chaining_state__8_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__10_ROUNDS:
			do__update__chaining_state__10_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__12_ROUNDS:
			do__update__chaining_state__12_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__14_ROUNDS:
			do__update__chaining_state__14_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		case CENTRE_ROUNDS__16_ROUNDS:
			do__update__chaining_state__16_rounds( state->state_array, preliminary_key__combined, principle_key, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, state->block_count__low_word );
			break;
		default:
			printf("invalid rounds\n");
			exit(1);
	}

	/* Produce the digest */
	do__finalise__chaining_state( state->state_array, state->hashbitlen, hashval );

	/* Wipe the local key material and scratch buffer */
	memset( principle_key, 0, sizeof( principle_key ) );
	memset( preliminary_key__left, 0, sizeof( preliminary_key__left ) );
	memset( preliminary_key__right, 0, sizeof( preliminary_key__right ) );
	memset( preliminary_key__combined, 0, sizeof( preliminary_key__combined ) );
	memset( xlate_array, 0, SGAIL__STATE__SIZE );

	/* Wipe the secret key, the input buffer and the counters held in the context */
	memset( state->secret_key, 0, SECRET_KEY__64_BIT_WORDS * sizeof( state->secret_key[ 0 ] ) );
	memset( state->partial_input_block, 0, SGAIL__INPUT_BLOCK__SIZE );
	state->partial_input_block__bit_length = 0;
	state->partial_input_block__byte_length = 0;
	state->block_count__low_word = 0;
	state->block_count__high_word = 0;
	state->serial_number = 0;

	return( SUCCESS );

}


/*
 * One-shot hash: allocates a temporary context, runs the init, update and finalise
 * shadow functions in turn and releases the context again.
 *
 * Returns the result of the first stage that does not report SUCCESS, otherwise SUCCESS.
 * If the context cannot be allocated, "out of memory" is printed and the process exits.
 */
HashReturn do__quick__hash( int hashbitlen, const BitSequence *data, DataLength databitlen, BitSequence *hashval, u32 centre_rounds, u32 principle_key_rounds, u64 secret_key[ SECRET_KEY__64_BIT_WORDS ], u64 serial_number, const u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], const u8 sbox[ SBOX__SIZE ] ) {

	HashReturn outcome;
	hashState  *context;

	context = (hashState *) malloc( sizeof( hashState ) );
	if ( context == NULL ) {

		printf("out of memory\n");
		exit(1);

	}

	/* Each stage only runs if the previous one succeeded */
	outcome = do__init__hash_state( context, hashbitlen, centre_rounds, principle_key_rounds, secret_key, serial_number, sbox );

	if ( outcome == SUCCESS ) {

		outcome = do__update__hash_state( context, data, databitlen, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );

	}

	if ( outcome == SUCCESS ) {

		outcome = do__finalise__hash_state( context, hashval, mds_8x8s, mds_16x8s_lhs, mds_16x8s_rhs, sbox );

	}

	/* The context is released on every path */
	free( context );

	return( outcome );

}






/*
 * Minibox and Sbox generation functions
 */

/*
 * Fills miniboxes[ 0 .. SBOX_GEN__MINIBOXES - 1 ] with 16-entry tables derived from u64_key.
 *
 * A working table, starting as the identity 0x00..0x0f, is shuffled one swap per
 * iteration. The swap position is driven by the table contents and by a key stream built
 * from three rotating copies of u64_key (rotated by 3, 5 and 7 bits per iteration).
 * Whenever loop_counter % 250 == 1 (and loop_counter > 1) the current table is analysed
 * and, if it meets the acceptance test below, copied into the next output slot. Shuffling
 * then continues from the same table.
 *
 * The loop only ends once SBOX_GEN__MINIBOXES tables have been accepted; there is no
 * iteration limit.
 */
void do__generate_minibox_set( u64 u64_key, u8 miniboxes[ SBOX_GEN__MINIBOXES ][ MINIBOX__SIZE ] ) {

	u32 loop_counter, loop_counter_b;
	u8  minibox[ MINIBOX__SIZE ] = { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f };
	u8  temp_val;
	u32  counter_j;
	u32 local_loop_counter;
	float dp_max, lp_max;
	u32 dp_max_int, lp_max_int;
	u32 non_linear_count, fixed_points;
	u32 complete_miniboxes;
	u64 u64_key_a, u64_key_b, u64_key_c, u64_result_key;

	counter_j = 0;
	complete_miniboxes = 0;
	loop_counter = 0;

	u64_key_a = u64_key;
	u64_key_b = u64_key;
	u64_key_c = u64_key;
	u64_result_key = 0;

	while ( complete_miniboxes < SBOX_GEN__MINIBOXES ) {

		loop_counter += 1;
		local_loop_counter = loop_counter & 0x0f;

		/* Pick the swap partner from the table contents and the previous key stream word */
		counter_j += local_loop_counter ^ ( minibox[ local_loop_counter ] ) ^ ( u64_result_key & 0x0f );
		counter_j = minibox[ counter_j & 0x0f ] + minibox[ local_loop_counter ];
		counter_j = minibox[ counter_j & 0x0f ];

		/* Advance the key stream: bitwise select between b and c under control of a */
		u64_key_a = ROTL_W( u64_key_a, 3, WORD_BITS_64, WORD_MODULUS_64 );
		u64_key_b = ROTL_W( u64_key_b, 5, WORD_BITS_64, WORD_MODULUS_64 );
		u64_key_c = ROTL_W( u64_key_c, 7, WORD_BITS_64, WORD_MODULUS_64 );
		u64_result_key = ( u64_key_a & u64_key_b ) | ( ( ~u64_key_a ) & u64_key_c );

		/* Swap the two entries */
		temp_val = minibox[ local_loop_counter ];
		minibox[ local_loop_counter ] = minibox[ counter_j ];
		minibox[ counter_j ] = temp_val;


		if ( loop_counter % 250 == 1 && loop_counter > 1 ) {

			dp_max = do__calc_minibox_dpmax( minibox );
			lp_max = do__calc_minibox_lpmax( minibox );
			non_linear_count = do__minibox_nonlinear_analysis( minibox );
			fixed_points = get__minibox_fixed_points( minibox );
			dp_max_int = (u32 )( dp_max * 16.0 );
			lp_max_int = (u32 )( lp_max * 16.0 );

			/* Acceptance test */
			if ( dp_max_int <= 4 && lp_max_int == 0 && non_linear_count == 4 && fixed_points == 0 ) {

				/* The index must advance here, otherwise the copy never terminates */
				for ( loop_counter_b = 0; loop_counter_b < MINIBOX__SIZE; loop_counter_b++ ) {

					miniboxes[ complete_miniboxes ][ loop_counter_b ] = minibox[ loop_counter_b ];

				}

				complete_miniboxes += 1;

			}

		}

	}

}


/*
 * Builds an sbox from sixteen miniboxes.
 *
 * Every index is passed through eight layers. A layer substitutes the low nibble with
 * minibox 2*layer and the high nibble with minibox 2*layer + 1. After each of the first
 * seven layers the value is passed through ROTL_W with a shift of 2 and the 8 bit word
 * parameters; the eighth layer (miniboxes 14 and 15) is stored directly.
 */
void do__construct_sbox_from_miniboxes( u8 sbox[ SBOX__SIZE ], u8 miniboxes[ SBOX_GEN__MINIBOXES ][ MINIBOX__SIZE ] ) {

	u32 entry, layer;
	u32 value, low_nibble, high_nibble;

	for ( entry = 0; entry < SBOX__SIZE; entry++ ) {

		value = entry;

		/* Seven substitute-then-rotate layers using miniboxes 0..13 */
		for ( layer = 0; layer < 7; layer++ ) {

			low_nibble  = miniboxes[ 2 * layer ][ value & 0x0f ];
			high_nibble = miniboxes[ ( 2 * layer ) + 1 ][ ( value >> 4 ) & 0x0f ];

			value = low_nibble ^ ( high_nibble << 4 );
			value = ROTL_W( value, 2, WORD_BITS_8, WORD_MODULUS_8 );

		}

		/* Final substitution layer, no rotation */
		low_nibble  = miniboxes[ 14 ][ value & 0x0f ];
		high_nibble = miniboxes[ 15 ][ ( value >> 4 ) & 0x0f ];

		sbox[ entry ] = low_nibble ^ ( high_nibble << 4 );

	}

}


/*
 * Generates SBOX_SELECTION_SET__SIZE candidate sboxes (one minibox set per rc_u64 entry),
 * prints the miniboxes and an analysis of every candidate, then copies the first
 * candidates accepted by get__is_sbox_good( ..., 12, 6, 100, 1 ) into sbox_set and prints
 * an analysis of those as well.
 *
 * Returns the number of sboxes written to sbox_set. This is at most SBOX_SET__SIZE and
 * is smaller when not enough candidates are accepted.
 */
u32 do__construct_sbox_set( u8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] ) {

	u8  full_miniboxes[ SBOX_SELECTION_SET__SIZE ][ SBOX_GEN__MINIBOXES ][ MINIBOX__SIZE ];
	u8  sbox_selection_set[ SBOX_SELECTION_SET__SIZE ][ SBOX__SIZE ];
	u32 loop_counter, loop_counter_b;
	u32 selection_counter;


	printf("Processing:\n");
	for ( loop_counter = 0; loop_counter < SBOX_SELECTION_SET__SIZE; loop_counter++ ) {

		printf("%u : .", loop_counter );
		do__generate_minibox_set( rc_u64[ loop_counter ], full_miniboxes[ loop_counter ] );
		printf(".\n");
		do__construct_sbox_from_miniboxes( sbox_selection_set[ loop_counter ], full_miniboxes[ loop_counter ] );

	}


	printf("\n\n\nMini boxes\n");
	printf("---------------------\n");
	for ( loop_counter = 0; loop_counter < SBOX_SELECTION_SET__SIZE; loop_counter++ ) {

		printf("\n[ Mini box set no. %u ]\n", loop_counter );
		/* This index walks the miniboxes of one set, so it is bounded by the number of miniboxes, not by the minibox size */
		for ( loop_counter_b = 0; loop_counter_b < SBOX_GEN__MINIBOXES; loop_counter_b++ ) {

			do__display_minibox( full_miniboxes[ loop_counter ][ loop_counter_b ] );

		}

	}

	for ( loop_counter = 0; loop_counter < SBOX_SELECTION_SET__SIZE; loop_counter++ ) {

		printf( "\n[ Selection sbox no. %u ]\n", loop_counter );
		do__full_sbox_analysis( sbox_selection_set[ loop_counter ] );

	}


	printf("\n\n\n------------------------------------------------------------\n\n\n");
	printf("Finding first %u suitable sboxes from selection list\n\n", SBOX_SET__SIZE );

	selection_counter = 0;
	for ( loop_counter = 0; loop_counter < SBOX_SELECTION_SET__SIZE; loop_counter++ ) {

		/* The size check comes first so that no candidate is tested once sbox_set is full */
		if ( selection_counter < SBOX_SET__SIZE && get__is_sbox_good( sbox_selection_set[ loop_counter ], 12, 6, 100, 1 ) ) {

			printf("Got sbox %u from selection list sbox %u\n", selection_counter, loop_counter );
			memcpy( sbox_set[ selection_counter ], sbox_selection_set[ loop_counter ], SBOX__SIZE );
			selection_counter += 1;

		}

	}


	printf("\n------------------------------------------------------------\n");
	printf("Final sboxes\n\n");

	for ( loop_counter = 0; loop_counter < selection_counter; loop_counter++ ) {

		printf( "\n[ Sbox no. %u ]\n", loop_counter );
		do__full_sbox_analysis( sbox_set[ loop_counter ] );

	}

	return(selection_counter);

}






/*
 * Sbox and minibox analysis functions
 */

/*
 * Returns the largest number of inputs x for which sbox[ x ^ a ] ^ sbox[ x ] equals b,
 * taken over every non-zero input difference a and every output difference b below
 * SBOX__SIZE, multiplied by 1/256.
 */
float do__calc_sbox_dpmax( const u8 sbox[ SBOX__SIZE ] ) {

	u32 input_diff, output_diff, x;
	u32 tally[ SBOX__SIZE ];
	u32 highest_tally;
	float dp_max;

	highest_tally = 0;

	for ( input_diff = 1; input_diff < SBOX__SIZE; input_diff++ ) {

		/* Start a fresh tally of output differences for this input difference */
		for ( output_diff = 0; output_diff < SBOX__SIZE; output_diff++ ) {

			tally[ output_diff ] = 0;

		}

		for ( x = 0; x < SBOX__SIZE; x++ ) {

			output_diff = sbox[ x ^ input_diff ] ^ sbox[ x ];

			/* Differences outside the table range are not counted */
			if ( output_diff < SBOX__SIZE ) {

				tally[ output_diff ] += 1;

			}

		}

		for ( output_diff = 0; output_diff < SBOX__SIZE; output_diff++ ) {

			if ( tally[ output_diff ] > highest_tally ) highest_tally = tally[ output_diff ];

		}

	}

	dp_max = (float)highest_tally * ( 1.0 / 256.0 );

	return( dp_max );

}


/*
 * Returns the largest number of inputs x for which minibox[ x ^ a ] ^ minibox[ x ] equals b,
 * taken over every non-zero input difference a and every output difference b below
 * MINIBOX__SIZE, multiplied by 1/16.
 */
float do__calc_minibox_dpmax( const u8 minibox[ MINIBOX__SIZE ] ) {

	u32 input_diff, output_diff, x;
	u32 tally[ MINIBOX__SIZE ];
	u32 highest_tally;
	float dp_max;

	highest_tally = 0;

	for ( input_diff = 1; input_diff < MINIBOX__SIZE; input_diff++ ) {

		/* Start a fresh tally of output differences for this input difference */
		for ( output_diff = 0; output_diff < MINIBOX__SIZE; output_diff++ ) {

			tally[ output_diff ] = 0;

		}

		for ( x = 0; x < MINIBOX__SIZE; x++ ) {

			output_diff = minibox[ x ^ input_diff ] ^ minibox[ x ];

			/* Differences outside the table range are not counted */
			if ( output_diff < MINIBOX__SIZE ) {

				tally[ output_diff ] += 1;

			}

		}

		for ( output_diff = 0; output_diff < MINIBOX__SIZE; output_diff++ ) {

			if ( tally[ output_diff ] > highest_tally ) highest_tally = tally[ output_diff ];

		}

	}

	dp_max = (float)highest_tally * ( 1.0 / 16.0 );

	return( dp_max );

}


/*
 * For every input mask a and every non-zero output mask b, counts the inputs x for which
 * do__get_dot_product( x, a ) equals do__get_dot_product( sbox[ x ], b ).
 * With p = (largest count) / 256, the value returned is ( 2p - 1 ) squared.
 */
float do__calc_sbox_lpmax( const u8 sbox[ SBOX__SIZE ] ) {

	u32 input_mask, output_mask, x;
	u32 agreements;
	u32 most_agreements = 0;
	float lp_max;

	for ( input_mask = 0; input_mask < SBOX__SIZE; input_mask++ ) {

		for ( output_mask = 1; output_mask < SBOX__SIZE; output_mask++ ) {

			agreements = 0;

			for ( x = 0; x < SBOX__SIZE; x++ ) {

				/* The comparison yields 1 on agreement, 0 otherwise */
				agreements += ( do__get_dot_product( x, input_mask ) == do__get_dot_product( sbox[ x ], output_mask ) );

			}

			if ( most_agreements < agreements ) {

				most_agreements = agreements;

			}

		}

	}

	/* Scale to a probability, then square the bias */
	lp_max = (float)most_agreements * ( 1.0 / 256.0 );
	lp_max = ( 2 * lp_max ) - 1;
	lp_max = lp_max * lp_max;

	return( lp_max );

}


/*
 * For every input mask a and every non-zero output mask b, counts the inputs x for which
 * do__get_dot_product( x, a ) equals do__get_dot_product( minibox[ x ], b ).
 * With p = (largest count) / 16, the value returned is ( 2p - 1 ) squared.
 */
float do__calc_minibox_lpmax( const u8 minibox[ MINIBOX__SIZE ] ) {

	u32 input_mask, output_mask, x;
	u32 agreements;
	u32 most_agreements = 0;
	float lp_max;

	for ( input_mask = 0; input_mask < MINIBOX__SIZE; input_mask++ ) {

		for ( output_mask = 1; output_mask < MINIBOX__SIZE; output_mask++ ) {

			agreements = 0;

			for ( x = 0; x < MINIBOX__SIZE; x++ ) {

				/* The comparison yields 1 on agreement, 0 otherwise */
				agreements += ( do__get_dot_product( x, input_mask ) == do__get_dot_product( minibox[ x ], output_mask ) );

			}

			if ( most_agreements < agreements ) {

				most_agreements = agreements;

			}

		}

	}

	/* Scale to a probability, then square the bias */
	lp_max = (float)most_agreements * ( 1.0 / 16.0 );
	lp_max = ( 2 * lp_max ) - 1;
	lp_max = lp_max * lp_max;

	return( lp_max );

}


/*
 * Evaluates a one-bit affine function on an 8 bit input: the parity of
 * ( in_x & in_coeff ), XORed with the lowest bit of in_b. Returns 0 or 1.
 */
u8 get__sbox_affine_evaluation( u8 in_x, u8 in_coeff, u8 in_b ) {

	u8 parity;

	parity = in_x & in_coeff;

	/* Fold the eight bits down so that bit 0 holds their XOR */
	parity ^= ( parity >> 4 );
	parity ^= ( parity >> 2 );
	parity ^= ( parity >> 1 );

	return( ( parity & 1 ) ^ ( in_b & 1 ) );
}


/*
 * Evaluates a one-bit affine function on a 4 bit input: the parity of the low four bits
 * of ( in_x & in_coeff ), XORed with the lowest bit of in_b. Returns 0 or 1.
 */
u8 get_minibox_affine_evaluation( u8 in_x, u8 in_coeff, u8 in_b ) {

	u8 parity;

	/* Only bits 0..3 take part */
	parity = ( in_x & in_coeff ) & 0x0f;

	/* Fold the four bits down so that bit 0 holds their XOR */
	parity ^= ( parity >> 2 );
	parity ^= ( parity >> 1 );

	return( ( parity & 1 ) ^ ( in_b & 1 ) );
}


u32 do__sbox_nonlinear_analysis( const u8 sbox[ SBOX__SIZE ] ) {

	int loop_counter;
	int inner_loop_counter;
	int bit_loop_counter;
	u8 hamming_weight;
	u8 affine_result, g_result;
	int current_distance_to_affine[ 8 ];
	int minimum_affine_count[ 8 ];
	u32 minimum_affine_count_result;

	for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {
		minimum_affine_count[ bit_loop_counter ] = 99999999;
	}

	// loop over all affine functions
	for ( loop_counter = 0; loop_counter < 256; loop_counter++ ) {

		// enumerate over x for affine functions (b=0)
		memset( current_distance_to_affine, 0, sizeof( current_distance_to_affine ) );
		for ( inner_loop_counter = 0; inner_loop_counter < 256; inner_loop_counter++ ) {

			// do each bit
			for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {

				// work out the result of the sbox nl function and compare
				g_result = ( sbox[ inner_loop_counter ] >> bit_loop_counter ) & 0x01;

				// we can just use the lsb (effectively just computed the dot product)
				affine_result = get__sbox_affine_evaluation( inner_loop_counter, loop_counter, 0 );

				// if different then increment counter
				if ( affine_result != g_result ) {
					 current_distance_to_affine[ bit_loop_counter ]++; 			
				}

			}

		}

//		printf("For affine fn %d (b=0), distance to an affine function is %d\n", loop_counter, current_distance_to_affine );

		for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {	
			if ( current_distance_to_affine[ bit_loop_counter ] < minimum_affine_count[ bit_loop_counter ] ) minimum_affine_count[ bit_loop_counter ] = current_distance_to_affine[ bit_loop_counter ];
		}

		// enumerate over x for affine functions (b=1)
		memset( current_distance_to_affine, 0, sizeof( current_distance_to_affine ) );
		for ( inner_loop_counter = 0; inner_loop_counter < 256; inner_loop_counter++ ) {

			// do each bit
			for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {

				// work out the result of the sbox nl function and compare
				g_result = ( sbox[ inner_loop_counter ] >> bit_loop_counter ) & 0x01;

				// we can just use the lsb (effectively just computed the dot product)
				affine_result = get__sbox_affine_evaluation( inner_loop_counter, loop_counter, 0 );

				// if different then increment counter
				if ( affine_result != g_result ) {
					 current_distance_to_affine[ bit_loop_counter ]++; 			
	//				 printf( "%d, %d -> %d, %d : %d (b=0) : sbox entry = %d\n", loop_counter, inner_loop_counter, g_result, affine_result, current_distance_to_affine, sbox[ inner_loop_counter ] );
				}

			}

		}

//		printf("For affine fn %d (b=0), distance to an affine function is %d\n", loop_counter, current_distance_to_affine );

		for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {	
			if ( current_distance_to_affine[ bit_loop_counter ] < minimum_affine_count[ bit_loop_counter ] ) minimum_affine_count[ bit_loop_counter ] = current_distance_to_affine[ bit_loop_counter ];
		}



	}

	// loop over all affine functions
//	printf("\n\n");
	minimum_affine_count_result = 9999999;
	for ( bit_loop_counter = 0; bit_loop_counter < 8; bit_loop_counter++ ) {	
//		printf("Nonlinearity for bit %d : [ %d ]\n", bit_loop_counter, minimum_affine_count[ bit_loop_counter ] );
		if ( minimum_affine_count[ bit_loop_counter ] < minimum_affine_count_result ) minimum_affine_count_result = minimum_affine_count[ bit_loop_counter ];
	}

	return( minimum_affine_count_result );

}


/*
 * Nonlinearity measure for a minibox.
 *
 * For every mask value ( 0 .. MINIBOX__SIZE - 1 ) and for both affine constants b = 0 and b = 1,
 * each of the four output bits of the minibox is compared, over all MINIBOX__SIZE inputs, with
 * get_minibox_affine_evaluation( input, mask, b ) and the number of disagreements is counted.
 * The return value is the smallest disagreement count found for any output bit.
 *
 * The b = 1 pass used to pass 0 as the constant, so it only repeated the b = 0 pass; the
 * constant is now handed through.
 * NOTE(review): this assumes the third argument of get_minibox_affine_evaluation is the affine
 * constant b - confirm against its definition.
 */
u32 do__minibox_nonlinear_analysis( u8 minibox[ MINIBOX__SIZE ] ) {

	int loop_counter;
	int inner_loop_counter;
	int bit_loop_counter;
	int affine_constant;
	u8 affine_result, g_result;
	int current_distance_to_affine[ 4 ];
	int minimum_affine_count[ 4 ];
	u32 minimum_affine_count_result;

	for ( bit_loop_counter = 0; bit_loop_counter < 4; bit_loop_counter++ ) {
		minimum_affine_count[ bit_loop_counter ] = 99999999;
	}

	// loop over all affine functions : every mask, with constant b = 0 and then b = 1
	for ( loop_counter = 0; loop_counter < MINIBOX__SIZE; loop_counter++ ) {

		for ( affine_constant = 0; affine_constant <= 1; affine_constant++ ) {

			// enumerate over x for this affine function
			memset( current_distance_to_affine, 0, sizeof( current_distance_to_affine ) );
			for ( inner_loop_counter = 0; inner_loop_counter < MINIBOX__SIZE; inner_loop_counter++ ) {

				// the affine value does not depend on the output bit, so evaluate it once per input
				affine_result = get_minibox_affine_evaluation( inner_loop_counter, loop_counter, affine_constant );

				// do each bit
				for ( bit_loop_counter = 0; bit_loop_counter < 4; bit_loop_counter++ ) {

					// work out the result of the minibox nl function and compare
					g_result = ( minibox[ inner_loop_counter ] >> bit_loop_counter ) & 0x01;

					// if different then increment counter
					if ( affine_result != g_result ) {
						current_distance_to_affine[ bit_loop_counter ]++;
					}

				}

			}

			// keep the smallest distance seen so far for each output bit
			for ( bit_loop_counter = 0; bit_loop_counter < 4; bit_loop_counter++ ) {
				if ( current_distance_to_affine[ bit_loop_counter ] < minimum_affine_count[ bit_loop_counter ] ) minimum_affine_count[ bit_loop_counter ] = current_distance_to_affine[ bit_loop_counter ];
			}

		}

	}

	// reduce the per-bit minima to one overall minimum (the counts are never negative, so the cast is safe)
	minimum_affine_count_result = 9999999;
	for ( bit_loop_counter = 0; bit_loop_counter < 4; bit_loop_counter++ ) {
		if ( (u32 )minimum_affine_count[ bit_loop_counter ] < minimum_affine_count_result ) minimum_affine_count_result = (u32 )minimum_affine_count[ bit_loop_counter ];
	}

	return( minimum_affine_count_result );

}


/* Count the entries of the sbox that map an index onto itself ( sbox[ i ] == i ) */
u32  get__sbox_fixed_points( const u8 sbox[ SBOX__SIZE ] ) {

	u32 position = 0;
	u32 tally = 0;

	while ( position < SBOX__SIZE ) {

		tally += ( sbox[ position ] == position ) ? 1 : 0;
		position++;

	}

	return( tally );

}


/* Count the entries of the minibox that map an index onto itself ( minibox[ i ] == i ) */
u32  get__minibox_fixed_points( u8 minibox[ MINIBOX__SIZE ] ) {

	u32 position = 0;
	u32 tally = 0;

	while ( position < MINIBOX__SIZE ) {

		tally += ( minibox[ position ] == position ) ? 1 : 0;
		position++;

	}

	return( tally );

}


/*
 * Check that every value 0 .. SBOX__SIZE - 1 appears somewhere in the sbox.
 * Each missing value is reported on stdout.  Returns 1 when nothing is missing, 0 otherwise.
 */
u32  get__is_sbox_bijective( const u8 sbox[ SBOX__SIZE ] ) {

	u32 wanted_value, position;
	u32 nothing_missing = 1;

	for ( wanted_value = 0; wanted_value < SBOX__SIZE; wanted_value++ ) {

		/* walk the table until an entry equal to wanted_value turns up (or the table is exhausted) */
		position = 0;
		while ( position < SBOX__SIZE && sbox[ position ] != wanted_value ) {
			position++;
		}

		if ( position == SBOX__SIZE ) {
			nothing_missing = 0;
			printf("not found at %u\n", wanted_value );
		}

	}

	return( nothing_missing );

}


/*
 * Decide whether an sbox meets the supplied quality limits.
 *
 *   dp_max_lim   : upper limit for do__calc_sbox_dpmax( sbox ) * 256, truncated to an integer
 *   lp_max_lim   : upper limit for do__calc_sbox_lpmax( sbox ) * 256, truncated to an integer
 *   nl_count_min : lower limit for do__sbox_nonlinear_analysis( sbox )
 *   fp_lim       : upper limit for get__sbox_fixed_points( sbox )
 *
 * Returns 1 when all four limits hold, 0 otherwise.
 *
 * The scaled integer values were previously computed and then ignored: the raw float results
 * were compared against the integer limits instead.  The comparison now uses the scaled values,
 * the same "n / 256" units that do__full_sbox_analysis prints.
 * NOTE(review): assumes the dpmax / lpmax helpers return a fraction in [ 0, 1 ] - confirm.
 */
u32  get__is_sbox_good( const u8 sbox[ SBOX__SIZE ], u32 dp_max_lim, u32 lp_max_lim, u32 nl_count_min, u32 fp_lim ) {

	float dp_max, lp_max;
	u32 dp_max_int, lp_max_int;
	u32 non_linear_count, fixed_points;

	dp_max = do__calc_sbox_dpmax( sbox );
	lp_max = do__calc_sbox_lpmax( sbox );
	non_linear_count = do__sbox_nonlinear_analysis( sbox );
	fixed_points = get__sbox_fixed_points( sbox );

	/* express both maxima in 256ths so they are comparable with the integer limits */
	dp_max_int = (u32 )( dp_max * 256.0 );
	lp_max_int = (u32 )( lp_max * 256.0 );

	if ( dp_max_int <= dp_max_lim && lp_max_int <= lp_max_lim && non_linear_count >= nl_count_min && fixed_points <= fp_lim ) {

		return( 1 );

	}

	return( 0 );

}


/*
 * Print a one-line summary of an sbox (dp max, lp max, nonlinearity count, fixed points)
 * followed by a bijectivity verdict, framed by separator lines.
 */
void do__full_sbox_analysis( const u8 sbox[ SBOX__SIZE ] ) {

	float differential_max, linear_max;
	u32 differential_max_256ths, linear_max_256ths;
	u32 nonlinearity, self_mapped;

	printf("-------------------------------------\n");

	/* gather the raw figures first ... */
	differential_max = do__calc_sbox_dpmax( sbox );
	linear_max = do__calc_sbox_lpmax( sbox );
	nonlinearity = do__sbox_nonlinear_analysis( sbox );
	self_mapped = get__sbox_fixed_points( sbox );

	/* ... then the same maxima expressed in 256ths for the report */
	differential_max_256ths = (u32 )( differential_max * 256.0 );
	linear_max_256ths = (u32 )( linear_max * 256.0 );

	printf("dp_max = %f (%u / 256), lp_max = %f (%u / 256), non_linear = %u, fixed_points = %u\n", differential_max, differential_max_256ths, linear_max, linear_max_256ths, nonlinearity, self_mapped );

	if ( get__is_sbox_bijective( sbox ) != 0 ) {

		printf("** Bijective **\n" );

	} else {

		printf("** Not bijective **\n");

	}

	printf("-------------------------------------\n\n\n");

}


/*
 * Run the minibox analysis helpers (dp max, lp max, nonlinearity count, fixed points).
 * The figures are computed but not reported: the report line below is currently disabled.
 */
void do__full_minibox_analysis( u8 minibox[ MINIBOX__SIZE ] ) {

	float differential_max, linear_max;
	u32 differential_max_16ths, linear_max_16ths;
	u32 nonlinearity, self_mapped;

	differential_max = do__calc_minibox_dpmax( minibox );
	linear_max = do__calc_minibox_lpmax( minibox );
	nonlinearity = do__minibox_nonlinear_analysis( minibox );
	self_mapped = get__minibox_fixed_points( minibox );

	/* the maxima expressed in 16ths */
	differential_max_16ths = (u32 )( differential_max * 16.0 );
	linear_max_16ths = (u32 )( linear_max * 16.0 );

	/* disabled report :
	 *   printf("dp_max = %f (%u / 16), lp_max = %f (%u / 16), non_linear = %u, fixed_points = %u\n", differential_max, differential_max_16ths, linear_max, linear_max_16ths, nonlinearity, self_mapped );
	 */

}






/*
 * Testing Functions
 */


/*
 * Test byte hamming weight (branch no - 1) of the pht and quad diffuse layers.
 *
 * Every non-zero byte value is placed, one at a time, at every position of an otherwise zero
 * 256 byte buffer.  The 16 byte row holding the seed goes through the 16x8 mds, the whole buffer
 * then goes through pht a, quad diffuse q0 and pht b, and the number of bytes that differ from the
 * seeded input is counted.  The smallest count over all seeds is printed at the end; any seed
 * giving a count of exactly 56 has its resulting buffer displayed.
 */
void do__test_hamming_weight__pht_qd() {

	u32 seed_position, seed_value, scan_position;
	u32 row_start;
	u32 changed_bytes, fewest_changed_bytes;
	u32 unchanged_value;
	u64 mds_output[ 2 ];
	u8  buffer[ 256 ];
	u8 major_cauchy_matrix[ MAJOR_CAUCHY_MATRIX__DIMENSION ][ MAJOR_CAUCHY_MATRIX__DIMENSION ];
	u8 ff_array_x1b[ GF__SIZE ][ GF__SIZE ];
	u64 mds_test_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], mds_test_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ];

	/* Build the field table, the Cauchy matrix and the mds lookup tables used by the test */
	do__create_finite_field_multiply_lut( 0x1b, ff_array_x1b );
	do__generate_cauchy_matrix( MAJOR_CAUCHY_MATRIX__DIMENSION, major_cauchy_array_f, major_cauchy_array_g, (u8 *)major_cauchy_matrix, ff_array_x1b );
	do__prepare_test_mds_16x8s_table( major_cauchy_matrix, ff_array_x1b, mds_test_16x8s_lhs, mds_test_16x8s_rhs, sbox_set_0 );

	fewest_changed_bytes = 256;

	for ( seed_position = 0; seed_position < 256; seed_position++ ) {

		/* first byte of the 16 byte row that contains the seed */
		row_start = seed_position & 0xf0;

		for ( seed_value = 1; seed_value < 256; seed_value++ ) {

			memset( buffer, 0, 256 );
			buffer[ seed_position ] = seed_value;

			/* Do the 16-bit mds on the seeded row and write the result back over it */
			do__single_mds_16x8s( &buffer[ row_start ], mds_output, mds_test_16x8s_lhs, mds_test_16x8s_rhs );
			memcpy( &buffer[ row_start ], mds_output, 16 );

			/* Now do normal diffuse stage */
			do__pht_a_diffuse( (u64 *)buffer );
			do__quad_diffuse__q0( (u64 *)buffer );
			do__pht_b_diffuse( (u64 *)buffer );

			/* Count the bytes that no longer hold their seeded input value */
			changed_bytes = 0;
			for ( scan_position = 0; scan_position < 256; scan_position++ ) {

				unchanged_value = ( scan_position == seed_position ) ? seed_value : 0;

				if ( buffer[ scan_position ] != unchanged_value ) {

					changed_bytes += 1;

				}

			}

			if ( changed_bytes < fewest_changed_bytes ) fewest_changed_bytes = changed_bytes;

			if ( changed_bytes == 56 ) {
				printf("\n\n\n\nIteration no %u : [ %u ]\n", seed_position, seed_value );
				do__display_state_buffer_64bit_words( (u64 *)buffer );
			}

		}

	}

	printf("Min hamming weight = %u\n", fewest_changed_bytes );

}


/*
 * Run the internal self tests and print one "+" (pass) or "-" (fail) line per check :
 * finite field lookup table, finite field inverse finder, major and minor Cauchy matrix
 * generation, and sbox verification.
 *
 * The minor Cauchy matrix check used to print the "Major" messages; it now reports itself
 * as the minor matrix so the two results can be told apart.
 */
void do__run_internal_test_suite( ) {

	u8 ff_array_x1b[ GF__SIZE ][ GF__SIZE ];
	u8 major_cauchy_matrix[ MAJOR_CAUCHY_MATRIX__DIMENSION ][ MAJOR_CAUCHY_MATRIX__DIMENSION ];
	u8 minor_cauchy_matrix[ MINOR_CAUCHY_MATRIX__DIMENSION ][ MINOR_CAUCHY_MATRIX__DIMENSION ];

	printf("\n\n\nRunning internal tests...\n\n");

	/* Run tests on finite field functions */
	do__create_finite_field_multiply_lut( 0x1b, ff_array_x1b );

	if ( get__finite_field_lut_verification( ff_array_x1b ) == VS__VALID ) {

		printf("+ Finite field created is valid.\n");

	} else {

		printf("- Finite field created is *NOT* valid.\n");

	}

	if ( get__finite_field_verify_inverse_element( ff_array_x1b ) == VS__VALID ) {

		printf("+ Finite field inverse element finder is valid.\n");

	} else {

		printf("- Finite field inverse element finder is *NOT* valid.\n");

	}


	/* Check Cauchy Matrix Generation - major matrix */
	if ( do__generate_cauchy_matrix( MAJOR_CAUCHY_MATRIX__DIMENSION, major_cauchy_array_f, major_cauchy_array_g, (u8 *)major_cauchy_matrix, ff_array_x1b ) == IES__SUCCESS ) {

		printf("+ Major Cauchy matrix generation OK\n");

	} else {

		printf("- Major Cauchy matrix generation *FAILED*\n");

	}

	/* Check Cauchy Matrix Generation - minor matrix */
	if ( do__generate_cauchy_matrix( MINOR_CAUCHY_MATRIX__DIMENSION, minor_cauchy_array_f, minor_cauchy_array_g, (u8 *)minor_cauchy_matrix, ff_array_x1b ) == IES__SUCCESS ) {

		printf("+ Minor Cauchy matrix generation OK\n");

	} else {

		printf("- Minor Cauchy matrix generation *FAILED*\n");

	}

	/* Check Sbox  */
	if ( get__sbox_verification( sbox_0 ) == VS__VALID ) {

		printf("+ Sbox is valid\n" );

	} else {

		printf("- Sbox is *NOT* valid\n" );

	}

}






/*
 * General Optimisation Functions
 */

/*
 * Search all rotation triples ( each value 1 .. 63 ) for a 16 step xor / rotate chain over
 * eight 64-bit words.
 *
 * For every triple, each of the 512 single-bit inputs is pushed through the chain and the byte
 * and bit hamming weights of the result are measured.  A triple is printed (with the minimum and
 * maximum weights it produced) when every input reached both caller thresholds and also reached
 * the internal ratchet value, which goes up by one after any triple whose minimum byte weight
 * exceeds it.  The fourth value on the printed line is always 0 : its search loop is disabled.
 */
void do__find_optimal_quad_diffuse_rotate_constants( u32 byte_hamming_weight_threshold, u32 bit_hamming_weight_threshold ) {

	u32 rot[ 4 ];
	u32 seed_word, seed_bit, step, target, source;
	u64 words[ 8 ];
	u32 byte_weight, bit_weight;
	u32 all_inputs_pass;
	u32 lowest_byte_weight, lowest_bit_weight;
	u32 highest_byte_weight, highest_bit_weight;
	u32 ratchet;

	rot[ 3 ] = 0;
	ratchet = 0;

	for ( rot[ 0 ] = 1; rot[ 0 ] < 64; rot[ 0 ]++ ) {

		for ( rot[ 1 ] = 1; rot[ 1 ] < 64; rot[ 1 ]++ ) {

			for ( rot[ 2 ] = 1; rot[ 2 ] < 64; rot[ 2 ]++ ) {

				/* **** Start test **** */
				all_inputs_pass = 1;
				lowest_byte_weight = 64;
				lowest_bit_weight = 512;
				highest_byte_weight = 0;
				highest_bit_weight = 0;

				for ( seed_word = 0; seed_word < 8; seed_word++ ) {

					for ( seed_bit = 0; seed_bit < 64; seed_bit++ ) {

						/* single bit set, everything else clear */
						memset( words, 0, 64 );
						words[ seed_word ] = 1LLU << ( seed_bit );

						/*
						 * Two sweeps over words 0 .. 7 : each word is mixed with its predecessor
						 * (word 0 with word 7) and the rotation cycles r0, r1, r2 across all 16 steps.
						 */
						for ( step = 0; step < 16; step++ ) {

							target = step % 8;
							source = ( step + 7 ) % 8;
							words[ target ] ^= ROTL_W( words[ target ] ^ words[ source ], rot[ step % 3 ], WORD_BITS_64, WORD_MODULUS_64 );

						}

						byte_weight = get__byte_hamming_weight__quad( words );
						bit_weight = get__bit_hamming_weight__quad( words );

						if ( byte_weight < byte_hamming_weight_threshold || bit_weight < bit_hamming_weight_threshold || byte_weight < ratchet ) {
							all_inputs_pass = 0;
						}

						if ( byte_weight < lowest_byte_weight ) lowest_byte_weight = byte_weight;
						if ( bit_weight < lowest_bit_weight ) lowest_bit_weight = bit_weight;

						if ( byte_weight > highest_byte_weight ) highest_byte_weight = byte_weight;
						if ( bit_weight > highest_bit_weight ) highest_bit_weight = bit_weight;

					}

				}

				if ( all_inputs_pass == 1 ) printf("%u,%u, %u, %u : [ %u,%u ], [ %u,%u ]\n", rot[ 0 ], rot[ 1 ], rot[ 2 ], rot[ 3 ], lowest_byte_weight, lowest_bit_weight, highest_byte_weight, highest_bit_weight );
				if ( lowest_byte_weight > ratchet ) ratchet += 1;
				/* **** End test **** */

			}

		}

	}

}


/*
 * Search all rotation pairs ( each value 1 .. 63 ) for a two sweep add / xor / rotate chain over
 * eight 64-bit words, starting from a caller supplied template instead of an all-zero state.
 *
 * For every pair, each of the 512 single-bit flips of the template is pushed through the chain,
 * the result is xored with the template, and the byte and bit hamming weights of that difference
 * are measured.  A pair is printed (with the minimum and maximum weights it produced) when every
 * flip reached both caller thresholds.
 */
void do__find_optimal_quad_diffuse_rotate_constants__with_template( u32 byte_hamming_weight_threshold, u32 bit_hamming_weight_threshold, u64 template_array[ PRELIMINARY_KEY__64_BIT_WORDS ] ) {

	u32 first_sweep_rot, second_sweep_rot;
	u32 flip_word, flip_bit, word, previous;
	u64 words[ 8 ];
	u32 byte_weight, bit_weight;
	u32 all_flips_pass;
	u32 lowest_byte_weight, lowest_bit_weight;
	u32 highest_byte_weight, highest_bit_weight;

	for ( first_sweep_rot = 1; first_sweep_rot < 64; first_sweep_rot++ ) {

		for ( second_sweep_rot = 1; second_sweep_rot < 64; second_sweep_rot++ ) {

			all_flips_pass = 1;
			lowest_byte_weight = 64;
			lowest_bit_weight = 512;
			highest_byte_weight = 0;
			highest_bit_weight = 0;

			for ( flip_word = 0; flip_word < 8; flip_word++ ) {

				for ( flip_bit = 0; flip_bit < 64; flip_bit++ ) {

					/* template with exactly one bit flipped */
					memcpy( words, template_array, 64 );
					words[ flip_word ] ^= 1LLU << ( flip_bit );

					/* first sweep : add the rotated xor of each word and its predecessor (word 0 pairs with word 7) */
					for ( word = 0; word < 8; word++ ) {

						previous = ( word + 7 ) % 8;
						words[ word ] += ROTL_W( words[ word ] ^ words[ previous ], first_sweep_rot, WORD_BITS_64, WORD_MODULUS_64 );

					}

					/* second sweep : xor in the rotated sum of each word and its predecessor */
					for ( word = 0; word < 8; word++ ) {

						previous = ( word + 7 ) % 8;
						words[ word ] ^= ROTL_W( words[ word ] + words[ previous ], second_sweep_rot, WORD_BITS_64, WORD_MODULUS_64 );

					}

					/* XOR with template - anything not zero has been changed */
					for ( word = 0; word < 8; word++ ) {

						words[ word ] ^= template_array[ word ];

					}

					byte_weight = get__byte_hamming_weight__quad( words );
					bit_weight = get__bit_hamming_weight__quad( words );

					if ( byte_weight < byte_hamming_weight_threshold || bit_weight < bit_hamming_weight_threshold ) {
						all_flips_pass = 0;
					}

					if ( byte_weight < lowest_byte_weight ) lowest_byte_weight = byte_weight;
					if ( bit_weight < lowest_bit_weight ) lowest_bit_weight = bit_weight;

					if ( byte_weight > highest_byte_weight ) highest_byte_weight = byte_weight;
					if ( bit_weight > highest_bit_weight ) highest_bit_weight = bit_weight;

				}

			}

			if ( all_flips_pass == 1 ) printf("%u,%u : [ %u,%u ], [ %u,%u ]\n", first_sweep_rot, second_sweep_rot, lowest_byte_weight, lowest_bit_weight, highest_byte_weight, highest_bit_weight );

		}

	}

}






/*
 * Sbox and MDS table export functions
 */

/*
 * Print a set of sboxes to stdout as a C array initialiser named sbox_set,
 * sixteen hex entries per line, one brace-delimited group per sbox.
 */
void do__dump_sbox_set( u8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] ) {

	u32 box, entry;

	printf("\n\nu8 sbox_set[ SBOX_SET__SIZE ][ SBOX__SIZE ] = {\n");

	for ( box = 0; box < SBOX_SET__SIZE; box++ ) {

		printf("\t{ /* Sbox %u*/", box );

		for ( entry = 0; entry < SBOX__SIZE ; entry++ ) {

			/* start a fresh, indented line every sixteen entries */
			if ( ( entry % 16 ) == 0 ) printf("\n\t\t");

			/* the final entry ends the line; every other entry is followed by a separator */
			if ( entry == ( SBOX__SIZE - 1 ) ) {

				printf("0x%02x\n", sbox_set[ box ][ entry ] );

			} else {

				printf("0x%02x, ", sbox_set[ box ][ entry ] );

			}

		}

		/* the final sbox also closes the whole initialiser */
		if ( box == ( SBOX_SET__SIZE - 1 ) ) {

			printf("\t}\n};\n\n");

		} else {

			printf("\t},\n");

		}

	}

}


/*
 * Print the 8x8 mds lookup table to stdout as a C array initialiser named mds_8x8s_0,
 * one 16 digit hex constant (with LLU suffix) per line, one brace-delimited group per index.
 *
 * The entry format used to be "%ll016x", which is not a valid conversion (the length modifier
 * must follow the flags and width); it is now "%016llx" with an explicit unsigned long long cast.
 */
void do__dump_mds_8x8s_table( u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] ) {

	u32 loop_counter_index, loop_counter_sbox;

	printf( "\n\nmds_8x8s_0[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ] = {\n" );

	for ( loop_counter_index = 0; loop_counter_index < MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE; loop_counter_index++ ) {

		printf("\t{ /* index %u */\n", loop_counter_index );
		for ( loop_counter_sbox = 0; loop_counter_sbox < MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE; loop_counter_sbox++ ) {

			printf("\t\t0x%016llxLLU,\n", (unsigned long long)mds_8x8s[ loop_counter_index ][ loop_counter_sbox ] );

		}

		/* every group except the last is followed by a separator */
		if ( loop_counter_index != ( MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE - 1 ) ) {

			printf("\t},\n" );

		} else {

			printf("\t}\n" );

		}

	}

	printf( "};\n\n");

}


/*
 * Print one half (lhs or rhs) of the 16x8 mds lookup table to stdout as a C array initialiser
 * called table_name, one 16 digit hex constant (with LLU suffix) per line.
 */
static void do__dump_mds_16x8s_half( const char *table_name, u64 table[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] ) {

	u32 loop_counter_index, loop_counter_sbox;

	printf( "\n\n%s[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] = {\n", table_name );

	for ( loop_counter_index = 0; loop_counter_index < MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE; loop_counter_index++ ) {

		printf("\t{ /* index %u */\n", loop_counter_index );
		for ( loop_counter_sbox = 0; loop_counter_sbox < MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE; loop_counter_sbox++ ) {

			printf("\t\t0x%016llxLLU,\n", (unsigned long long)table[ loop_counter_index ][ loop_counter_sbox ] );

		}

		/* every group except the last is followed by a separator */
		if ( loop_counter_index != ( MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE - 1 ) ) {

			printf("\t},\n" );

		} else {

			printf("\t}\n" );

		}

	}

	printf( "};\n\n");

}


/*
 * Print both halves of the 16x8 mds lookup table to stdout as C array initialisers named
 * mds_16x8s_lhs_0 and mds_16x8s_rhs_0 (lhs first).
 *
 * The entry format used to be "%ll016x", which is not a valid conversion (the length modifier
 * must follow the flags and width); it is now "%016llx" with an explicit unsigned long long cast.
 */
void do__dump_mds_16x8s_table( u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], u64 mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ] ) {

	/* Do lhs */
	do__dump_mds_16x8s_half( "mds_16x8s_lhs_0", mds_16x8s_lhs );

	/* Do rhs */
	do__dump_mds_16x8s_half( "mds_16x8s_rhs_0", mds_16x8s_rhs );

}







int main() {

	u32 loop_counter, loop_counter_b, loop_counter_c;
	u8 major_cauchy_matrix[ MAJOR_CAUCHY_MATRIX__DIMENSION ][ MAJOR_CAUCHY_MATRIX__DIMENSION ];
	u8 minor_cauchy_matrix[ MINOR_CAUCHY_MATRIX__DIMENSION ][ MINOR_CAUCHY_MATRIX__DIMENSION ];
	u8 ff_array_x1b[ GF__SIZE ][ GF__SIZE ];
	u8 ff_array_x1d[ GF__SIZE ][ GF__SIZE ];	
	u64 mds_8x8s[ MDS__8BIT_X_64BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_64BIT_TABLE_SBOX__SIZE ];
	u64 mds_16x8s_lhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ], mds_16x8s_rhs[ MDS__8BIT_X_128BIT_TABLE_INDEX__SIZE ][ MDS__8BIT_X_128BIT_TABLE_SBOX__SIZE ];
	u64 state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 out_state_array[ SGAIL__NUM_64_BIT_WORDS ];
	u64 round_key[ SGAIL__NUM_64_BIT_WORDS ];
	u64 key_array[ SGAIL__NUM_64_BIT_WORDS ];
	u8  xlate_array[ SGAIL__STATE__SIZE ];
	u64 principle_key__1x_blocks[ SGAIL__NUM_64_BIT_WORDS ];
	u64 secret_key[ SECRET_KEY__64_BIT_WORDS ];
	u64 current_block[ 2 ];
	u64 preliminary_key[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u64 test_message_block[ SGAIL__NUM_64_BIT_WORDS ];
	hashState *state;
	BitSequence *input_data;
	BitSequence *hash_value;
	HashReturn hash_result;
	u64 template_array[ PRELIMINARY_KEY__64_BIT_WORDS ];
	u8  hash_224[ DIGEST__224_BITS__BYTE_LENGTH ];
	u8  hash_256[ DIGEST__256_BITS__BYTE_LENGTH ];
	u8  hash_384[ DIGEST__384_BITS__BYTE_LENGTH ];
	u8  hash_512[ DIGEST__512_BITS__BYTE_LENGTH ];
	u8  hash_768[ DIGEST__768_BITS__BYTE_LENGTH ];
	u8  hash_1024[ DIGEST__1024_BITS__BYTE_LENGTH ];
	u8  hash_1536[ DIGEST__1536_BITS__BYTE_LENGTH ];
	u8  hash_2048[ DIGEST__2048_BITS__BYTE_LENGTH ];

	
	
	input_data = (BitSequence *)malloc( 10000 );
	hash_value = (BitSequence *)malloc( 256 );

		memset( hash_224, 0, DIGEST__224_BITS__BYTE_LENGTH  );
		memset( hash_256, 0, DIGEST__256_BITS__BYTE_LENGTH  );
		memset( hash_384, 0, DIGEST__384_BITS__BYTE_LENGTH  );
		memset( hash_512, 0, DIGEST__512_BITS__BYTE_LENGTH  );
		memset( hash_768, 0, DIGEST__768_BITS__BYTE_LENGTH  );
		memset( hash_1024, 0, DIGEST__1024_BITS__BYTE_LENGTH  );
		memset( hash_1536, 0, DIGEST__1536_BITS__BYTE_LENGTH  );
		memset( hash_2048, 0, DIGEST__2048_BITS__BYTE_LENGTH  );

		printf("Intermedite hash values for 224, 256, 384, 512, 768, 1024, 1536, and 2048 bits\n");
		printf("1 Message Block, 'AAAAA'\n");
		printf("-------------------------------------------------------------------\n\n\n\n");


		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 224 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__224_BITS, input_data, 5 * 8 , hash_224 );
		do__display_224_bit_hash__byte_wise( hash_224 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 256 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__256_BITS, input_data, 5 * 8 , hash_256 );
		do__display_256_bit_hash__byte_wise( hash_256 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 384 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__384_BITS, input_data, 5 * 8 , hash_384 );
		do__display_384_bit_hash__byte_wise( hash_384 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 512 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__512_BITS, input_data, 5 * 8 , hash_512 );
		do__display_512_bit_hash__byte_wise( hash_512 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 768 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__768_BITS, input_data, 5 * 8 , hash_768 );
		do__display_768_bit_hash__byte_wise( hash_768 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 1024 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__1024_BITS, input_data, 5 * 8 , hash_1024 );
		do__display_1024_bit_hash__byte_wise( hash_1024 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 1536 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__1536_BITS, input_data, 5 * 8 , hash_1536 );
		do__display_1536_bit_hash__byte_wise( hash_1536 );
	
		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 2048 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 5 );
		Hash( DIGEST__2048_BITS, input_data, 5 * 8 , hash_2048 );
		do__display_2048_bit_hash__byte_wise( hash_2048 );



/*
		memset( hash_224, 0, DIGEST__224_BITS__BYTE_LENGTH  );
		memset( hash_256, 0, DIGEST__256_BITS__BYTE_LENGTH  );
		memset( hash_384, 0, DIGEST__384_BITS__BYTE_LENGTH  );
		memset( hash_512, 0, DIGEST__512_BITS__BYTE_LENGTH  );
		memset( hash_768, 0, DIGEST__768_BITS__BYTE_LENGTH  );
		memset( hash_1024, 0, DIGEST__1024_BITS__BYTE_LENGTH  );
		memset( hash_1536, 0, DIGEST__1536_BITS__BYTE_LENGTH  );
		memset( hash_2048, 0, DIGEST__2048_BITS__BYTE_LENGTH  );

		printf("Intermedite hash values for 224, 256, 384, 512, 768, 1024, 1536, and 2048 bits\n");
		printf("2 Message Blocks, 600 x 'A'\n");
		printf("-------------------------------------------------------------------\n\n\n\n");


		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 224 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__224_BITS, input_data, 600 * 8 , hash_224 );
		do__display_224_bit_hash__byte_wise( hash_224 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 256 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__256_BITS, input_data, 600 * 8 , hash_256 );
		do__display_256_bit_hash__byte_wise( hash_256 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 384 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__384_BITS, input_data, 600 * 8 , hash_384 );
		do__display_384_bit_hash__byte_wise( hash_384 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 512 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__512_BITS, input_data, 600 * 8 , hash_512 );
		do__display_512_bit_hash__byte_wise( hash_512 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 768 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__768_BITS, input_data, 600 * 8 , hash_768 );
		do__display_768_bit_hash__byte_wise( hash_768 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 1024 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__1024_BITS, input_data, 600 * 8 , hash_1024 );
		do__display_1024_bit_hash__byte_wise( hash_1024 );

		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 1536 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__1536_BITS, input_data, 600 * 8 , hash_1536 );
		do__display_1536_bit_hash__byte_wise( hash_1536 );
	
		printf("-------------------------------------------------------------------\n\n\n\n");
		printf("Starting 2048 bits hash\n");
		printf("-------------------------------------------------------------------\n\n\n\n");
		memset( input_data, 0, 10000 );
		memset( input_data, 'A', 600 );
		Hash( DIGEST__2048_BITS, input_data, 600 * 8 , hash_2048 );
		do__display_2048_bit_hash__byte_wise( hash_2048 );
*/


	free( hash_value );
	free( input_data );





	return(0);

}








