diff --git a/CMakeLists.txt b/CMakeLists.txt
index 9f6b43a8..ce28f3ef 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -21,6 +21,17 @@ else (WIN32)
 	IF (CMAKE_SYSTEM_PROCESSOR MATCHES "^(i.86|x86(_64)?)$")
 		set (PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -msse4")
 		set (BLAKE2_IMPLEMENTATION "blake2/blake2b.c")
+		if (ENABLE_AES)
+			set (PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -maes")
+		endif()
+		if (ENABLE_AVX2)
+			set (PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -mavx2")
+			if (PERMUTE_WITH_GATHER)
+				set (PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DPERMUTE_WITH_GATHER")
+			elseif (PERMUTE_WITH_SHUFFLES)
+				set (PLATFORM_COMPILE_FLAGS "${PLATFORM_COMPILE_FLAGS} -DPERMUTE_WITH_SHUFFLES")
+			endif()
+		endif()
 	else()
 		set (BLAKE2_IMPLEMENTATION "blake2/blake2b-ref.c")
 	endif()
diff --git a/blake2/blake2-config.h b/blake2/blake2-config.h
index 70d61f10..12af5a71 100644
--- a/blake2/blake2-config.h
+++ b/blake2/blake2-config.h
@@ -31,6 +31,10 @@
 #define HAVE_AVX
 #endif
 
+#if defined(__AVX2__)
+#define HAVE_AVX2
+#endif
+
 #if defined(__XOP__)
 #define HAVE_XOP
 #endif
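Note for reviewers (not part of the patch): the new CMake options only inject compiler flags, so the message-permute strategy is fixed at compile time by the PERMUTE_WITH_* symbols. A minimal standalone probe mirroring the #if chain used by blake2b-compress-avx2.h; permute_probe.c is a hypothetical file name for illustration:

    /* permute_probe.c -- illustrative only. Reports which message-load
       strategy a given flag combination selects.
       Build e.g.: cc -mavx2 -DPERMUTE_WITH_GATHER permute_probe.c */
    #include <stdio.h>

    int main(void)
    {
    #if !defined(__AVX2__)
        puts("AVX2 off: blake2b keeps the SSE compress path");
    #elif defined(PERMUTE_WITH_SHUFFLES)
        puts("AVX2: message loads via unpack/blend shuffles");
    #elif defined(PERMUTE_WITH_GATHER)
        puts("AVX2: message loads via vpgatherqq");
    #else
        puts("AVX2: simple _mm256_set_epi64x message loads");
    #endif
        return 0;
    }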
diff --git a/blake2/blake2b-compress-avx2.h b/blake2/blake2b-compress-avx2.h
new file mode 100644
index 00000000..3deeece8
--- /dev/null
+++ b/blake2/blake2b-compress-avx2.h
@@ -0,0 +1,207 @@
+/*
+https://github.com/sneves/blake2-avx2
+https://github.com/jedisct1/libsodium/
+*/
+
+#ifndef blake2b_compress_avx2_H
+#define blake2b_compress_avx2_H
+
+#define LOAD128(p) _mm_load_si128((__m128i *) (p))
+#define STORE128(p, r) _mm_store_si128((__m128i *) (p), r)
+
+#define LOADU128(p) _mm_loadu_si128((__m128i *) (p))
+#define STOREU128(p, r) _mm_storeu_si128((__m128i *) (p), r)
+
+#define LOAD(p) _mm256_load_si256((__m256i *) (p))
+#define STORE(p, r) _mm256_store_si256((__m256i *) (p), r)
+
+#define LOADU(p) _mm256_loadu_si256((__m256i *) (p))
+#define STOREU(p, r) _mm256_storeu_si256((__m256i *) (p), r)
+
+static inline uint64_t
+LOADU64(const void *p)
+{
+    uint64_t v;
+    memcpy(&v, p, sizeof v);
+    return v;
+}
+
+#define ROTATE16 \
+    _mm256_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9, 2, \
+                     3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9)
+
+#define ROTATE24 \
+    _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10, 3, \
+                     4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10)
+
+#define ADD(a, b) _mm256_add_epi64(a, b)
+#define SUB(a, b) _mm256_sub_epi64(a, b)
+
+#define XOR(a, b) _mm256_xor_si256(a, b)
+#define AND(a, b) _mm256_and_si256(a, b)
+#define OR(a, b) _mm256_or_si256(a, b)
+
+#define ROT32(x) _mm256_shuffle_epi32((x), _MM_SHUFFLE(2, 3, 0, 1))
+#define ROT24(x) _mm256_shuffle_epi8((x), ROTATE24)
+#define ROT16(x) _mm256_shuffle_epi8((x), ROTATE16)
+#define ROT63(x) _mm256_or_si256(_mm256_srli_epi64((x), 63), ADD((x), (x)))
+
+#define BLAKE2B_G1_V1(a, b, c, d, m) \
+    do {                             \
+        a = ADD(a, m);               \
+        a = ADD(a, b);               \
+        d = XOR(d, a);               \
+        d = ROT32(d);                \
+        c = ADD(c, d);               \
+        b = XOR(b, c);               \
+        b = ROT24(b);                \
+    } while (0)
+
+#define BLAKE2B_G2_V1(a, b, c, d, m) \
+    do {                             \
+        a = ADD(a, m);               \
+        a = ADD(a, b);               \
+        d = XOR(d, a);               \
+        d = ROT16(d);                \
+        c = ADD(c, d);               \
+        b = XOR(b, c);               \
+        b = ROT63(b);                \
+    } while (0)
+
+#define BLAKE2B_DIAG_V1(a, b, c, d)                               \
+    do {                                                          \
+        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \
+        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
+        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \
+    } while (0)
+
+#define BLAKE2B_UNDIAG_V1(a, b, c, d)                             \
+    do {                                                          \
+        d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \
+        c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \
+        b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \
+    } while (0)
+
+#if defined(PERMUTE_WITH_SHUFFLES)
+  #include "blake2b-load-avx2.h"
+#elif defined(PERMUTE_WITH_GATHER)
+#else
+  #include "blake2b-load-avx2-simple.h"
+#endif
+
+#if defined(PERMUTE_WITH_GATHER)
+ALIGN(64) static const uint32_t indices[12][16] = {
+    { 0,  2,  4,  6,  1,  3,  5,  7,  8, 10, 12, 14,  9, 11, 13, 15},
+    {14,  4,  9, 13, 10,  8, 15,  6,  1,  0, 11,  5, 12,  2,  7,  3},
+    {11, 12,  5, 15,  8,  0,  2, 13, 10,  3,  7,  9, 14,  6,  1,  4},
+    { 7,  3, 13, 11,  9,  1, 12, 14,  2,  5,  4, 15,  6, 10,  0,  8},
+    { 9,  5,  2, 10,  0,  7,  4, 15, 14, 11,  6,  3,  1, 12,  8, 13},
+    { 2,  6,  0,  8, 12, 10, 11,  3,  4,  7, 15,  1, 13,  5, 14,  9},
+    {12,  1, 14,  4,  5, 15, 13, 10,  0,  6,  9,  8,  7,  3,  2, 11},
+    {13,  7, 12,  3, 11, 14,  1,  9,  5, 15,  8,  2,  0,  4,  6, 10},
+    { 6, 14, 11,  0, 15,  9,  3,  8, 12, 13,  1, 10,  2,  7,  4,  5},
+    {10,  8,  7,  1,  2,  4,  6,  5, 15,  9,  3, 13, 11, 14, 12,  0},
+    { 0,  2,  4,  6,  1,  3,  5,  7,  8, 10, 12, 14,  9, 11, 13, 15},
+    {14,  4,  9, 13, 10,  8, 15,  6,  1,  0, 11,  5, 12,  2,  7,  3},
+};
+
+#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
+    __m256i b0; \
+    b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 0]), 8); \
+    BLAKE2B_G1_V1(a, b, c, d, b0); \
+    b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 4]), 8); \
+    BLAKE2B_G2_V1(a, b, c, d, b0); \
+    BLAKE2B_DIAG_V1(a, b, c, d); \
+    b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][ 8]), 8); \
+    BLAKE2B_G1_V1(a, b, c, d, b0); \
+    b0 = _mm256_i32gather_epi64((void *)(m), LOAD128(&indices[r][12]), 8); \
+    BLAKE2B_G2_V1(a, b, c, d, b0); \
+    BLAKE2B_UNDIAG_V1(a, b, c, d); \
+} while(0)
+
+#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
+    int i; \
+    for(i = 0; i < 12; ++i) { \
+        BLAKE2B_ROUND_V1(a, b, c, d, i, m); \
+    } \
+} while(0)
+#else /* !PERMUTE_WITH_GATHER */
+#define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \
+    __m256i b0; \
+    BLAKE2B_LOAD_MSG_ ##r ##_1(b0); \
+    BLAKE2B_G1_V1(a, b, c, d, b0); \
+    BLAKE2B_LOAD_MSG_ ##r ##_2(b0); \
+    BLAKE2B_G2_V1(a, b, c, d, b0); \
+    BLAKE2B_DIAG_V1(a, b, c, d); \
+    BLAKE2B_LOAD_MSG_ ##r ##_3(b0); \
+    BLAKE2B_G1_V1(a, b, c, d, b0); \
+    BLAKE2B_LOAD_MSG_ ##r ##_4(b0); \
+    BLAKE2B_G2_V1(a, b, c, d, b0); \
+    BLAKE2B_UNDIAG_V1(a, b, c, d); \
+} while(0)
+
+#define BLAKE2B_ROUNDS_V1(a, b, c, d, m) do { \
+    BLAKE2B_ROUND_V1(a, b, c, d, 0, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 1, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 2, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 3, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 4, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 5, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 6, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 7, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 8, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 9, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 10, (m)); \
+    BLAKE2B_ROUND_V1(a, b, c, d, 11, (m)); \
+} while(0)
+#endif
+
+#if defined(PERMUTE_WITH_GATHER)
+#define DECLARE_MESSAGE_WORDS(m)
+#elif defined(PERMUTE_WITH_SHUFFLES)
+#define DECLARE_MESSAGE_WORDS(m)                                        \
+    const __m256i m0 = _mm256_broadcastsi128_si256(LOADU128((m) + 0));  \
+    const __m256i m1 = _mm256_broadcastsi128_si256(LOADU128((m) + 16)); \
+    const __m256i m2 = _mm256_broadcastsi128_si256(LOADU128((m) + 32)); \
+    const __m256i m3 = _mm256_broadcastsi128_si256(LOADU128((m) + 48)); \
+    const __m256i m4 = _mm256_broadcastsi128_si256(LOADU128((m) + 64)); \
+    const __m256i m5 = _mm256_broadcastsi128_si256(LOADU128((m) + 80)); \
+    const __m256i m6 = _mm256_broadcastsi128_si256(LOADU128((m) + 96)); \
+    const __m256i m7 = _mm256_broadcastsi128_si256(LOADU128((m) + 112)); \
+    __m256i t0, t1;
+#else
+#define DECLARE_MESSAGE_WORDS(m)            \
+    const uint64_t m0 = LOADU64((m) + 0);   \
+    const uint64_t m1 = LOADU64((m) + 8);   \
+    const uint64_t m2 = LOADU64((m) + 16);  \
+    const uint64_t m3 = LOADU64((m) + 24);  \
+    const uint64_t m4 = LOADU64((m) + 32);  \
+    const uint64_t m5 = LOADU64((m) + 40);  \
+    const uint64_t m6 = LOADU64((m) + 48);  \
+    const uint64_t m7 = LOADU64((m) + 56);  \
+    const uint64_t m8 = LOADU64((m) + 64);  \
+    const uint64_t m9 = LOADU64((m) + 72);  \
+    const uint64_t m10 = LOADU64((m) + 80); \
+    const uint64_t m11 = LOADU64((m) + 88); \
+    const uint64_t m12 = LOADU64((m) + 96); \
+    const uint64_t m13 = LOADU64((m) + 104); \
+    const uint64_t m14 = LOADU64((m) + 112); \
+    const uint64_t m15 = LOADU64((m) + 120);
+#endif
+
+#define BLAKE2B_COMPRESS_V1(a, b, m, t0, t1, f0, f1)                      \
+    do {                                                                  \
+        DECLARE_MESSAGE_WORDS(m)                                          \
+        const __m256i iv0 = a;                                            \
+        const __m256i iv1 = b;                                            \
+        __m256i c = LOAD(&blake2b_IV[0]);                                 \
+        __m256i d =                                                       \
+            XOR(LOAD(&blake2b_IV[4]), _mm256_set_epi64x(f1, f0, t1, t0)); \
+        BLAKE2B_ROUNDS_V1(a, b, c, d, m);                                 \
+        a = XOR(a, c);                                                    \
+        b = XOR(b, d);                                                    \
+        a = XOR(a, iv0);                                                  \
+        b = XOR(b, iv1);                                                  \
+    } while (0)
+
+#endif
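For reference (not part of the patch): the ROT16/ROT24 byte shuffles above are just 64-bit rotates expressed with vpshufb, since a rotate by a multiple of 8 bits is a byte permutation within each lane. A self-contained sanity check, assuming only AVX2 and C99; rot_check.c is a hypothetical file name:

    /* rot_check.c -- illustrative only. Confirms the ROTATE24 shuffle mask
       rotates each 64-bit lane right by 24 bits. Build: cc -mavx2 rot_check.c */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    static uint64_t rotr64(uint64_t x, unsigned n)
    {
        return (x >> n) | (x << (64 - n));
    }

    int main(void)
    {
        const __m256i rot24 =
            _mm256_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10,
                             3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
        uint64_t in[4] = { 0x0123456789abcdefULL, 1, 0xffffffff00000000ULL, 42 };
        uint64_t out[4];

        __m256i v = _mm256_loadu_si256((const __m256i *) in);
        _mm256_storeu_si256((__m256i *) out, _mm256_shuffle_epi8(v, rot24));

        for (int i = 0; i < 4; i++) {
            if (out[i] != rotr64(in[i], 24)) {
                printf("lane %d mismatch\n", i);
                return 1;
            }
        }
        puts("ROT24 shuffle == scalar rotr64(x, 24)");
        return 0;
    }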
diff --git a/blake2/blake2b-load-avx2-simple.h b/blake2/blake2b-load-avx2-simple.h
new file mode 100644
index 00000000..2d6fa01e
--- /dev/null
+++ b/blake2/blake2b-load-avx2-simple.h
@@ -0,0 +1,54 @@
+#ifndef BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_SIMPLE_H
+#define BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_SIMPLE_H
+
+#define BLAKE2B_LOAD_MSG_0_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0);
+#define BLAKE2B_LOAD_MSG_0_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1);
+#define BLAKE2B_LOAD_MSG_0_3(b0) b0 = _mm256_set_epi64x(m14, m12, m10, m8);
+#define BLAKE2B_LOAD_MSG_0_4(b0) b0 = _mm256_set_epi64x(m15, m13, m11, m9);
+#define BLAKE2B_LOAD_MSG_1_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14);
+#define BLAKE2B_LOAD_MSG_1_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10);
+#define BLAKE2B_LOAD_MSG_1_3(b0) b0 = _mm256_set_epi64x(m5, m11, m0, m1);
+#define BLAKE2B_LOAD_MSG_1_4(b0) b0 = _mm256_set_epi64x(m3, m7, m2, m12);
+#define BLAKE2B_LOAD_MSG_2_1(b0) b0 = _mm256_set_epi64x(m15, m5, m12, m11);
+#define BLAKE2B_LOAD_MSG_2_2(b0) b0 = _mm256_set_epi64x(m13, m2, m0, m8);
+#define BLAKE2B_LOAD_MSG_2_3(b0) b0 = _mm256_set_epi64x(m9, m7, m3, m10);
+#define BLAKE2B_LOAD_MSG_2_4(b0) b0 = _mm256_set_epi64x(m4, m1, m6, m14);
+#define BLAKE2B_LOAD_MSG_3_1(b0) b0 = _mm256_set_epi64x(m11, m13, m3, m7);
+#define BLAKE2B_LOAD_MSG_3_2(b0) b0 = _mm256_set_epi64x(m14, m12, m1, m9);
+#define BLAKE2B_LOAD_MSG_3_3(b0) b0 = _mm256_set_epi64x(m15, m4, m5, m2);
+#define BLAKE2B_LOAD_MSG_3_4(b0) b0 = _mm256_set_epi64x(m8, m0, m10, m6);
+#define BLAKE2B_LOAD_MSG_4_1(b0) b0 = _mm256_set_epi64x(m10, m2, m5, m9);
+#define BLAKE2B_LOAD_MSG_4_2(b0) b0 = _mm256_set_epi64x(m15, m4, m7, m0);
+#define BLAKE2B_LOAD_MSG_4_3(b0) b0 = _mm256_set_epi64x(m3, m6, m11, m14);
+#define BLAKE2B_LOAD_MSG_4_4(b0) b0 = _mm256_set_epi64x(m13, m8, m12, m1);
+#define BLAKE2B_LOAD_MSG_5_1(b0) b0 = _mm256_set_epi64x(m8, m0, m6, m2);
+#define BLAKE2B_LOAD_MSG_5_2(b0) b0 = _mm256_set_epi64x(m3, m11, m10, m12);
+#define BLAKE2B_LOAD_MSG_5_3(b0) b0 = _mm256_set_epi64x(m1, m15, m7, m4);
+#define BLAKE2B_LOAD_MSG_5_4(b0) b0 = _mm256_set_epi64x(m9, m14, m5, m13);
+#define BLAKE2B_LOAD_MSG_6_1(b0) b0 = _mm256_set_epi64x(m4, m14, m1, m12);
+#define BLAKE2B_LOAD_MSG_6_2(b0) b0 = _mm256_set_epi64x(m10, m13, m15, m5);
+#define BLAKE2B_LOAD_MSG_6_3(b0) b0 = _mm256_set_epi64x(m8, m9, m6, m0);
+#define BLAKE2B_LOAD_MSG_6_4(b0) b0 = _mm256_set_epi64x(m11, m2, m3, m7);
+#define BLAKE2B_LOAD_MSG_7_1(b0) b0 = _mm256_set_epi64x(m3, m12, m7, m13);
+#define BLAKE2B_LOAD_MSG_7_2(b0) b0 = _mm256_set_epi64x(m9, m1, m14, m11);
+#define BLAKE2B_LOAD_MSG_7_3(b0) b0 = _mm256_set_epi64x(m2, m8, m15, m5);
+#define BLAKE2B_LOAD_MSG_7_4(b0) b0 = _mm256_set_epi64x(m10, m6, m4, m0);
+#define BLAKE2B_LOAD_MSG_8_1(b0) b0 = _mm256_set_epi64x(m0, m11, m14, m6);
+#define BLAKE2B_LOAD_MSG_8_2(b0) b0 = _mm256_set_epi64x(m8, m3, m9, m15);
+#define BLAKE2B_LOAD_MSG_8_3(b0) b0 = _mm256_set_epi64x(m10, m1, m13, m12);
+#define BLAKE2B_LOAD_MSG_8_4(b0) b0 = _mm256_set_epi64x(m5, m4, m7, m2);
+#define BLAKE2B_LOAD_MSG_9_1(b0) b0 = _mm256_set_epi64x(m1, m7, m8, m10);
+#define BLAKE2B_LOAD_MSG_9_2(b0) b0 = _mm256_set_epi64x(m5, m6, m4, m2);
+#define BLAKE2B_LOAD_MSG_9_3(b0) b0 = _mm256_set_epi64x(m13, m3, m9, m15);
+#define BLAKE2B_LOAD_MSG_9_4(b0) b0 = _mm256_set_epi64x(m0, m12, m14, m11);
+#define BLAKE2B_LOAD_MSG_10_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0);
+#define BLAKE2B_LOAD_MSG_10_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1);
+#define BLAKE2B_LOAD_MSG_10_3(b0) b0 = _mm256_set_epi64x(m14, m12, m10, m8);
+#define BLAKE2B_LOAD_MSG_10_4(b0) b0 = _mm256_set_epi64x(m15, m13, m11, m9);
+#define BLAKE2B_LOAD_MSG_11_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14);
+#define BLAKE2B_LOAD_MSG_11_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10);
+#define BLAKE2B_LOAD_MSG_11_3(b0) b0 = _mm256_set_epi64x(m5, m11, m0, m1);
+#define BLAKE2B_LOAD_MSG_11_4(b0) b0 = _mm256_set_epi64x(m3, m7, m2, m12);
+
+#endif
+
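Aside (not part of the patch): each LOAD_MSG_r_k macro hard-codes one row of the BLAKE2b sigma permutation; the _1/_3 loads take the even entries of row r and the _2/_4 loads the odd entries. A small scalar cross-check for BLAKE2B_LOAD_MSG_1_1 against the sigma table from the BLAKE2 specification; sigma_check.c is a hypothetical file name:

    /* sigma_check.c -- illustrative only. _mm256_set_epi64x(m13, m9, m4, m14)
       fills lanes low-to-high with m14, m4, m9, m13, which should equal the
       even entries of sigma row 1. */
    #include <stdint.h>
    #include <stdio.h>

    static const uint8_t sigma1[16] = {
        14, 10, 4, 8, 9, 15, 13, 6, 1, 12, 0, 2, 11, 7, 5, 3
    };

    int main(void)
    {
        const uint8_t msg_1_1[4] = { 14, 4, 9, 13 }; /* lanes, low first */

        for (int lane = 0; lane < 4; lane++) {
            if (msg_1_1[lane] != sigma1[2 * lane]) {
                printf("lane %d: expected m%d\n", lane, sigma1[2 * lane]);
                return 1;
            }
        }
        puts("BLAKE2B_LOAD_MSG_1_1 matches sigma row 1 (even entries)");
        return 0;
    }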
diff --git a/blake2/blake2b-load-avx2.h b/blake2/blake2b-load-avx2.h
new file mode 100644
index 00000000..c865c9ad
--- /dev/null
+++ b/blake2/blake2b-load-avx2.h
@@ -0,0 +1,340 @@
+#ifndef BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
+#define BLAKE2_AVX2_BLAKE2B_LOAD_AVX2_H
+
+#define BLAKE2B_LOAD_MSG_0_1(b0) do { \
+  t0 = _mm256_unpacklo_epi64(m0, m1); \
+  t1 = _mm256_unpacklo_epi64(m2, m3); \
+  b0 = _mm256_blend_epi32(t0, t1, 0xF0); \
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m0, m1);\
+t1 = _mm256_unpackhi_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m5);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_0_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m5);\
+t1 = _mm256_unpackhi_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m2);\
+t1 = _mm256_unpackhi_epi64(m4, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_alignr_epi8(m3, m7, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_3(b0) \
+do { \
+t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
+t1 = _mm256_unpackhi_epi64(m5, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_1_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m1);\
+t1 = _mm256_unpackhi_epi64(m3, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_1(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m6, m5, 8);\
+t1 = _mm256_unpackhi_epi64(m2, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m0);\
+t1 = _mm256_blend_epi32(m6, m1, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m1, m5, 0x33);\
+t1 = _mm256_unpackhi_epi64(m3, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_2_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m3);\
+t1 = _mm256_alignr_epi8(m2, m0, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m3, m1);\
+t1 = _mm256_unpackhi_epi64(m6, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m0);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m2, m1, 0x33);\
+t1 = _mm256_blend_epi32(m7, m2, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_3_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m3, m5);\
+t1 = _mm256_unpacklo_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m2);\
+t1 = _mm256_unpacklo_epi64(m1, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_2(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m0, 0x33);\
+t1 = _mm256_blend_epi32(m7, m2, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m5, m7, 0x33);\
+t1 = _mm256_blend_epi32(m1, m3, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_4_4(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m6, m0, 8);\
+t1 = _mm256_blend_epi32(m6, m4, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m1, m3);\
+t1 = _mm256_unpacklo_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m5);\
+t1 = _mm256_unpackhi_epi64(m5, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_3(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m2, 0x33);\
+t1 = _mm256_unpackhi_epi64(m7, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_5_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m6, m2);\
+t1 = _mm256_blend_epi32(m4, m7, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_1(b0) \
+do { \
+t0 = _mm256_blend_epi32(m0, m6, 0x33);\
+t1 = _mm256_unpacklo_epi64(m7, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m2, m7);\
+t1 = _mm256_alignr_epi8(m5, m6, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m3);\
+t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_6_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m3, m1);\
+t1 = _mm256_blend_epi32(m5, m1, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_1(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m6, m3);\
+t1 = _mm256_blend_epi32(m1, m6, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_2(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m7, m5, 8);\
+t1 = _mm256_unpackhi_epi64(m0, m4);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_3(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m2, m7);\
+t1 = _mm256_unpacklo_epi64(m4, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_7_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m2);\
+t1 = _mm256_unpacklo_epi64(m3, m5);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m3, m7);\
+t1 = _mm256_alignr_epi8(m0, m5, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m7, m4);\
+t1 = _mm256_alignr_epi8(m4, m1, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_3(b0) \
+do { \
+t0 = m6;\
+t1 = _mm256_alignr_epi8(m5, m0, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_8_4(b0) \
+do { \
+t0 = _mm256_blend_epi32(m3, m1, 0x33);\
+t1 = m2;\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_unpackhi_epi64(m3, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m1, m2);\
+t1 = _mm256_blend_epi32(m2, m3, 0x33);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_3(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m7, m4);\
+t1 = _mm256_unpackhi_epi64(m1, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_9_4(b0) \
+do { \
+t0 = _mm256_alignr_epi8(m7, m5, 8);\
+t1 = _mm256_unpacklo_epi64(m6, m0);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m0, m1);\
+t1 = _mm256_unpacklo_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_2(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m0, m1);\
+t1 = _mm256_unpackhi_epi64(m2, m3);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_3(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m4, m5);\
+t1 = _mm256_unpacklo_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_10_4(b0) \
+do { \
+t0 = _mm256_unpackhi_epi64(m4, m5);\
+t1 = _mm256_unpackhi_epi64(m6, m7);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_1(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m7, m2);\
+t1 = _mm256_unpackhi_epi64(m4, m6);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_2(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m5, m4);\
+t1 = _mm256_alignr_epi8(m3, m7, 8);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_3(b0) \
+do { \
+t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\
+t1 = _mm256_unpackhi_epi64(m5, m2);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#define BLAKE2B_LOAD_MSG_11_4(b0) \
+do { \
+t0 = _mm256_unpacklo_epi64(m6, m1);\
+t1 = _mm256_unpackhi_epi64(m3, m1);\
+b0 = _mm256_blend_epi32(t0, t1, 0xF0);\
+} while(0)
+
+#endif
+
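Background (not part of the patch): in this variant each mN register holds one 16-byte pair of message words broadcast to both 128-bit lanes, so unpacklo/unpackhi plus a 0xF0 blend can assemble four permuted words without any lane-crossing shuffle. A hedged standalone sketch of the trick for BLAKE2B_LOAD_MSG_0_1, which should yield words w0, w2, w4, w6; load_check.c is a hypothetical file name:

    /* load_check.c -- illustrative only. Build: cc -mavx2 load_check.c */
    #include <immintrin.h>
    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
        uint64_t w[8] = { 10, 11, 12, 13, 14, 15, 16, 17 }; /* w0..w7 */

        /* broadcast 16-byte pairs, as DECLARE_MESSAGE_WORDS does for
           the PERMUTE_WITH_SHUFFLES variant */
        __m256i m0 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *) &w[0]));
        __m256i m1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *) &w[2]));
        __m256i m2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *) &w[4]));
        __m256i m3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *) &w[6]));

        /* body of BLAKE2B_LOAD_MSG_0_1 */
        __m256i t0 = _mm256_unpacklo_epi64(m0, m1);
        __m256i t1 = _mm256_unpacklo_epi64(m2, m3);
        __m256i b0 = _mm256_blend_epi32(t0, t1, 0xF0);

        uint64_t out[4];
        _mm256_storeu_si256((__m256i *) out, b0);
        printf("%llu %llu %llu %llu\n", /* expect 10 12 14 16 = w0 w2 w4 w6 */
               (unsigned long long) out[0], (unsigned long long) out[1],
               (unsigned long long) out[2], (unsigned long long) out[3]);
        return 0;
    }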
diff --git a/blake2/blake2b.c b/blake2/blake2b.c
index 30df1a4e..d4cbdc11 100644
--- a/blake2/blake2b.c
+++ b/blake2/blake2b.c
@@ -37,7 +37,11 @@
 #include <x86intrin.h>
 #endif
 
+#if defined(HAVE_AVX2)
+#include "blake2b-compress-avx2.h"
+#else
 #include "blake2b-round.h"
+#endif
 
 ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
 {
@@ -47,22 +51,6 @@ ALIGN( 64 ) static const uint64_t blake2b_IV[8] =
   0x1f83d9abfb41bd6bULL, 0x5be0cd19137e2179ULL
 };
 
-static const uint8_t blake2b_sigma[12][16] =
-{
-  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 } ,
-  { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 } ,
-  {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 } ,
-  {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 } ,
-  {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 } ,
-  { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 } ,
-  { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 } ,
-  {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 } ,
-  { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 } ,
-  {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 } ,
-  { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 }
-};
-
 /* Some helper functions, not necessarily useful */
 static inline int blake2b_set_lastnode( blake2b_state *S )
 {
@@ -247,6 +235,20 @@ int blake2b_init_key( blake2b_state *S, const uint8_t outlen, const void *key, const uint8_t keylen )
   return 0;
 }
 
+#if defined(HAVE_AVX2)
+
+static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
+{
+  __m256i a = LOADU(&S->h[0]);
+  __m256i b = LOADU(&S->h[4]);
+  BLAKE2B_COMPRESS_V1(a, b, block, S->t[0], S->t[1], S->f[0], S->f[1]);
+  STOREU(&S->h[0], a);
+  STOREU(&S->h[4], b);
+
+  return 0;
+}
+#else
+
 static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
 {
   __m128i row1l, row1h;
@@ -316,6 +318,7 @@ static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
   STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
   return 0;
 }
+#endif
 
 int blake2b_update( blake2b_state *S, const uint8_t *in, uint64_t inlen )
 {
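One way to confirm the AVX2 and SSE compress paths agree is to check a known-answer test; a hedged sketch below hashes the empty message and compares against the BLAKE2b-512 test vector from RFC 7693. It assumes the streaming blake2b_init/blake2b_update/blake2b_final API already present in this tree; kat_check.c is a hypothetical file name. Run it once with the default build and once with -DENABLE_AVX2=ON.

    /* kat_check.c -- illustrative only. BLAKE2b-512 of the empty message
       per RFC 7693; any mismatch flags a broken compress path. */
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include "blake2.h"

    int main(void)
    {
        static const char expect[] =
            "786a02f742015903c6c6fd852552d272912f4740e15847618a86e217f71f5419"
            "d25e1031afee585313896444934eb04b903a685b1448b755d56f701afe9be2ce";
        uint8_t out[64];
        char hex[129];
        blake2b_state S;

        blake2b_init(&S, 64);
        blake2b_update(&S, (const uint8_t *) "", 0);
        blake2b_final(&S, out, 64);

        for (int i = 0; i < 64; i++)
            sprintf(&hex[2 * i], "%02x", out[i]);

        puts(strcmp(hex, expect) == 0 ? "BLAKE2b-512 KAT: ok"
                                      : "BLAKE2b-512 KAT: MISMATCH");
        return 0;
    }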