From c0e269286e0cc812e55d6e287147e2c48adb66a8 Mon Sep 17 00:00:00 2001 From: Sergey Kroshnin Date: Fri, 17 May 2019 17:42:31 +0300 Subject: [PATCH] Apply Blake2b AVX2 changes (#1994) https://github.com/sneves/blake2-avx2/pull/4 About 10% speed up: tweak diagonal shuffle --- crypto/blake2/blake2b-compress-avx2.h | 36 ++++----- crypto/blake2/blake2b-load-avx2-simple.h | 48 ++++++------ crypto/blake2/blake2b-load-avx2.h | 96 ++++++++++++------------ 3 files changed, 90 insertions(+), 90 deletions(-) diff --git a/crypto/blake2/blake2b-compress-avx2.h b/crypto/blake2/blake2b-compress-avx2.h index 3deeece8..d763261e 100644 --- a/crypto/blake2/blake2b-compress-avx2.h +++ b/crypto/blake2/blake2b-compress-avx2.h @@ -70,16 +70,16 @@ LOADU64(const void *p) #define BLAKE2B_DIAG_V1(a, b, c, d) \ do { \ - d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(2, 1, 0, 3)); \ - c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \ - b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(0, 3, 2, 1)); \ + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(2, 1, 0, 3)); \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1, 0, 3, 2)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(0, 3, 2, 1)); \ } while (0) #define BLAKE2B_UNDIAG_V1(a, b, c, d) \ do { \ - d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(0, 3, 2, 1)); \ - c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(1, 0, 3, 2)); \ - b = _mm256_permute4x64_epi64(b, _MM_SHUFFLE(2, 1, 0, 3)); \ + a = _mm256_permute4x64_epi64(a, _MM_SHUFFLE(0, 3, 2, 1)); \ + d = _mm256_permute4x64_epi64(d, _MM_SHUFFLE(1, 0, 3, 2)); \ + c = _mm256_permute4x64_epi64(c, _MM_SHUFFLE(2, 1, 0, 3)); \ } while (0) #if defined(PERMUTE_WITH_SHUFFLES) @@ -91,18 +91,18 @@ LOADU64(const void *p) #if defined(PERMUTE_WITH_GATHER) ALIGN(64) static const uint32_t indices[12][16] = { - { 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, - {14, 4, 9, 13,10, 8, 15, 6, 1, 0, 11, 5,12, 2, 7, 3}, - {11, 12, 5, 15, 8, 0, 2, 13,10, 3, 7, 9,14, 6, 1, 4}, - { 7, 3, 13, 11, 9, 1, 12, 14, 2, 5, 4, 15, 6, 10, 0, 8}, - { 9, 5, 2, 10, 0, 7, 4, 15,14, 11, 6, 3, 1, 12, 8, 13}, - { 2, 6, 0, 8,12, 10, 11, 3, 4, 7, 15, 1,13, 5, 14, 9}, - {12, 1, 14, 4, 5, 15, 13, 10, 0, 6, 9, 8, 7, 3, 2, 11}, - {13, 7, 12, 3,11, 14, 1, 9, 5, 15, 8, 2, 0, 4, 6, 10}, - { 6, 14, 11, 0,15, 9, 3, 8,12, 13, 1, 10, 2, 7, 4, 5}, - {10, 8, 7, 1, 2, 4, 6, 5,15, 9, 3, 13,11, 14, 12, 0}, - { 0, 2, 4, 6, 1, 3, 5, 7, 8, 10, 12, 14, 9, 11, 13, 15}, - {14, 4, 9, 13,10, 8, 15, 6, 1, 0, 11, 5,12, 2, 7, 3}, + { 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13}, + {14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7}, + {11, 12, 5, 15, 8, 0, 2, 13, 9, 10, 3, 7, 4, 14, 6, 1}, + { 7, 3, 13, 11, 9, 1, 12, 14, 15, 2, 5, 4, 8, 6, 10, 0}, + { 9, 5, 2, 10, 0, 7, 4, 15, 3, 14, 11, 6, 13, 1, 12, 8}, + { 2, 6, 0, 8, 12, 10, 11, 3, 1, 4, 7, 15, 9, 13, 5, 14}, + {12, 1, 14, 4, 5, 15, 13, 10, 8, 0, 6, 9, 11, 7, 3, 2}, + {13, 7, 12, 3, 11, 14, 1, 9, 2, 5, 15, 8, 10, 0, 4, 6}, + { 6, 14, 11, 0, 15, 9, 3, 8, 10, 12, 13, 1, 5, 2, 7, 4}, + {10, 8, 7, 1, 2, 4, 6, 5, 13, 15, 9, 3, 0, 11, 14, 12}, + { 0, 2, 4, 6, 1, 3, 5, 7, 14, 8, 10, 12, 15, 9, 11, 13}, + {14, 4, 9, 13, 10, 8, 15, 6, 5, 1, 0, 11, 3, 12, 2, 7}, }; #define BLAKE2B_ROUND_V1(a, b, c, d, r, m) do { \ diff --git a/crypto/blake2/blake2b-load-avx2-simple.h b/crypto/blake2/blake2b-load-avx2-simple.h index 2d6fa01e..1c29c23d 100644 --- a/crypto/blake2/blake2b-load-avx2-simple.h +++ b/crypto/blake2/blake2b-load-avx2-simple.h @@ -3,52 +3,52 @@ #define BLAKE2B_LOAD_MSG_0_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0); #define BLAKE2B_LOAD_MSG_0_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1); -#define BLAKE2B_LOAD_MSG_0_3(b0) b0 = _mm256_set_epi64x(m14, m12, m10, m8); -#define BLAKE2B_LOAD_MSG_0_4(b0) b0 = _mm256_set_epi64x(m15, m13, m11, m9); +#define BLAKE2B_LOAD_MSG_0_3(b0) b0 = _mm256_set_epi64x(m12, m10, m8, m14); +#define BLAKE2B_LOAD_MSG_0_4(b0) b0 = _mm256_set_epi64x(m13, m11, m9, m15); #define BLAKE2B_LOAD_MSG_1_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14); #define BLAKE2B_LOAD_MSG_1_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10); -#define BLAKE2B_LOAD_MSG_1_3(b0) b0 = _mm256_set_epi64x(m5, m11, m0, m1); -#define BLAKE2B_LOAD_MSG_1_4(b0) b0 = _mm256_set_epi64x(m3, m7, m2, m12); +#define BLAKE2B_LOAD_MSG_1_3(b0) b0 = _mm256_set_epi64x(m11, m0, m1, m5); +#define BLAKE2B_LOAD_MSG_1_4(b0) b0 = _mm256_set_epi64x(m7, m2, m12, m3); #define BLAKE2B_LOAD_MSG_2_1(b0) b0 = _mm256_set_epi64x(m15, m5, m12, m11); #define BLAKE2B_LOAD_MSG_2_2(b0) b0 = _mm256_set_epi64x(m13, m2, m0, m8); -#define BLAKE2B_LOAD_MSG_2_3(b0) b0 = _mm256_set_epi64x(m9, m7, m3, m10); -#define BLAKE2B_LOAD_MSG_2_4(b0) b0 = _mm256_set_epi64x(m4, m1, m6, m14); +#define BLAKE2B_LOAD_MSG_2_3(b0) b0 = _mm256_set_epi64x(m7, m3, m10, m9); +#define BLAKE2B_LOAD_MSG_2_4(b0) b0 = _mm256_set_epi64x(m1, m6, m14, m4); #define BLAKE2B_LOAD_MSG_3_1(b0) b0 = _mm256_set_epi64x(m11, m13, m3, m7); #define BLAKE2B_LOAD_MSG_3_2(b0) b0 = _mm256_set_epi64x(m14, m12, m1, m9); -#define BLAKE2B_LOAD_MSG_3_3(b0) b0 = _mm256_set_epi64x(m15, m4, m5, m2); -#define BLAKE2B_LOAD_MSG_3_4(b0) b0 = _mm256_set_epi64x(m8, m0, m10, m6); +#define BLAKE2B_LOAD_MSG_3_3(b0) b0 = _mm256_set_epi64x(m4, m5, m2, m15); +#define BLAKE2B_LOAD_MSG_3_4(b0) b0 = _mm256_set_epi64x(m0, m10, m6, m8); #define BLAKE2B_LOAD_MSG_4_1(b0) b0 = _mm256_set_epi64x(m10, m2, m5, m9); #define BLAKE2B_LOAD_MSG_4_2(b0) b0 = _mm256_set_epi64x(m15, m4, m7, m0); -#define BLAKE2B_LOAD_MSG_4_3(b0) b0 = _mm256_set_epi64x(m3, m6, m11, m14); -#define BLAKE2B_LOAD_MSG_4_4(b0) b0 = _mm256_set_epi64x(m13, m8, m12, m1); +#define BLAKE2B_LOAD_MSG_4_3(b0) b0 = _mm256_set_epi64x(m6, m11, m14, m3); +#define BLAKE2B_LOAD_MSG_4_4(b0) b0 = _mm256_set_epi64x(m8, m12, m1, m13); #define BLAKE2B_LOAD_MSG_5_1(b0) b0 = _mm256_set_epi64x(m8, m0, m6, m2); #define BLAKE2B_LOAD_MSG_5_2(b0) b0 = _mm256_set_epi64x(m3, m11, m10, m12); -#define BLAKE2B_LOAD_MSG_5_3(b0) b0 = _mm256_set_epi64x(m1, m15, m7, m4); -#define BLAKE2B_LOAD_MSG_5_4(b0) b0 = _mm256_set_epi64x(m9, m14, m5, m13); +#define BLAKE2B_LOAD_MSG_5_3(b0) b0 = _mm256_set_epi64x(m15, m7, m4, m1); +#define BLAKE2B_LOAD_MSG_5_4(b0) b0 = _mm256_set_epi64x(m14, m5, m13, m9); #define BLAKE2B_LOAD_MSG_6_1(b0) b0 = _mm256_set_epi64x(m4, m14, m1, m12); #define BLAKE2B_LOAD_MSG_6_2(b0) b0 = _mm256_set_epi64x(m10, m13, m15, m5); -#define BLAKE2B_LOAD_MSG_6_3(b0) b0 = _mm256_set_epi64x(m8, m9, m6, m0); -#define BLAKE2B_LOAD_MSG_6_4(b0) b0 = _mm256_set_epi64x(m11, m2, m3, m7); +#define BLAKE2B_LOAD_MSG_6_3(b0) b0 = _mm256_set_epi64x(m9, m6, m0, m8); +#define BLAKE2B_LOAD_MSG_6_4(b0) b0 = _mm256_set_epi64x(m2, m3, m7, m11); #define BLAKE2B_LOAD_MSG_7_1(b0) b0 = _mm256_set_epi64x(m3, m12, m7, m13); #define BLAKE2B_LOAD_MSG_7_2(b0) b0 = _mm256_set_epi64x(m9, m1, m14, m11); -#define BLAKE2B_LOAD_MSG_7_3(b0) b0 = _mm256_set_epi64x(m2, m8, m15, m5); -#define BLAKE2B_LOAD_MSG_7_4(b0) b0 = _mm256_set_epi64x(m10, m6, m4, m0); +#define BLAKE2B_LOAD_MSG_7_3(b0) b0 = _mm256_set_epi64x(m8, m15, m5, m2); +#define BLAKE2B_LOAD_MSG_7_4(b0) b0 = _mm256_set_epi64x(m6, m4, m0, m10); #define BLAKE2B_LOAD_MSG_8_1(b0) b0 = _mm256_set_epi64x(m0, m11, m14, m6); #define BLAKE2B_LOAD_MSG_8_2(b0) b0 = _mm256_set_epi64x(m8, m3, m9, m15); -#define BLAKE2B_LOAD_MSG_8_3(b0) b0 = _mm256_set_epi64x(m10, m1, m13, m12); -#define BLAKE2B_LOAD_MSG_8_4(b0) b0 = _mm256_set_epi64x(m5, m4, m7, m2); +#define BLAKE2B_LOAD_MSG_8_3(b0) b0 = _mm256_set_epi64x(m1, m13, m12, m10); +#define BLAKE2B_LOAD_MSG_8_4(b0) b0 = _mm256_set_epi64x(m4, m7, m2, m5); #define BLAKE2B_LOAD_MSG_9_1(b0) b0 = _mm256_set_epi64x(m1, m7, m8, m10); #define BLAKE2B_LOAD_MSG_9_2(b0) b0 = _mm256_set_epi64x(m5, m6, m4, m2); -#define BLAKE2B_LOAD_MSG_9_3(b0) b0 = _mm256_set_epi64x(m13, m3, m9, m15); -#define BLAKE2B_LOAD_MSG_9_4(b0) b0 = _mm256_set_epi64x(m0, m12, m14, m11); +#define BLAKE2B_LOAD_MSG_9_3(b0) b0 = _mm256_set_epi64x(m3, m9, m15, m13); +#define BLAKE2B_LOAD_MSG_9_4(b0) b0 = _mm256_set_epi64x(m12, m14, m11, m0); #define BLAKE2B_LOAD_MSG_10_1(b0) b0 = _mm256_set_epi64x(m6, m4, m2, m0); #define BLAKE2B_LOAD_MSG_10_2(b0) b0 = _mm256_set_epi64x(m7, m5, m3, m1); -#define BLAKE2B_LOAD_MSG_10_3(b0) b0 = _mm256_set_epi64x(m14, m12, m10, m8); -#define BLAKE2B_LOAD_MSG_10_4(b0) b0 = _mm256_set_epi64x(m15, m13, m11, m9); +#define BLAKE2B_LOAD_MSG_10_3(b0) b0 = _mm256_set_epi64x(m12, m10, m8, m14); +#define BLAKE2B_LOAD_MSG_10_4(b0) b0 = _mm256_set_epi64x(m13, m11, m9, m15); #define BLAKE2B_LOAD_MSG_11_1(b0) b0 = _mm256_set_epi64x(m13, m9, m4, m14); #define BLAKE2B_LOAD_MSG_11_2(b0) b0 = _mm256_set_epi64x(m6, m15, m8, m10); -#define BLAKE2B_LOAD_MSG_11_3(b0) b0 = _mm256_set_epi64x(m5, m11, m0, m1); -#define BLAKE2B_LOAD_MSG_11_4(b0) b0 = _mm256_set_epi64x(m3, m7, m2, m12); +#define BLAKE2B_LOAD_MSG_11_3(b0) b0 = _mm256_set_epi64x(m11, m0, m1, m5); +#define BLAKE2B_LOAD_MSG_11_4(b0) b0 = _mm256_set_epi64x(m7, m2, m12, m3); #endif diff --git a/crypto/blake2/blake2b-load-avx2.h b/crypto/blake2/blake2b-load-avx2.h index c865c9ad..28278f40 100644 --- a/crypto/blake2/blake2b-load-avx2.h +++ b/crypto/blake2/blake2b-load-avx2.h @@ -16,15 +16,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_0_3(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m4, m5);\ -t1 = _mm256_unpacklo_epi64(m6, m7);\ +t0 = _mm256_unpacklo_epi64(m7, m4);\ +t1 = _mm256_unpacklo_epi64(m5, m6);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_0_4(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m4, m5);\ -t1 = _mm256_unpackhi_epi64(m6, m7);\ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_unpackhi_epi64(m5, m6);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -44,15 +44,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_1_3(b0) \ do { \ -t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ -t1 = _mm256_unpackhi_epi64(m5, m2);\ +t0 = _mm256_unpackhi_epi64(m2, m0);\ +t1 = _mm256_blend_epi32(m5, m0, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_1_4(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m6, m1);\ -t1 = _mm256_unpackhi_epi64(m3, m1);\ +t0 = _mm256_alignr_epi8(m6, m1, 8);\ +t1 = _mm256_blend_epi32(m3, m1, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -72,15 +72,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_2_3(b0) \ do { \ -t0 = _mm256_blend_epi32(m1, m5, 0x33);\ -t1 = _mm256_unpackhi_epi64(m3, m4);\ +t0 = _mm256_alignr_epi8(m5, m4, 8);\ +t1 = _mm256_unpackhi_epi64(m1, m3);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_2_4(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m7, m3);\ -t1 = _mm256_alignr_epi8(m2, m0, 8);\ +t0 = _mm256_unpacklo_epi64(m2, m7);\ +t1 = _mm256_blend_epi32(m0, m3, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -100,15 +100,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_3_3(b0) \ do { \ -t0 = _mm256_blend_epi32(m2, m1, 0x33);\ -t1 = _mm256_blend_epi32(m7, m2, 0x33);\ +t0 = _mm256_alignr_epi8(m1, m7, 8);\ +t1 = _mm256_shuffle_epi32(m2, _MM_SHUFFLE(1,0,3,2));\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_3_4(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m3, m5);\ -t1 = _mm256_unpacklo_epi64(m0, m4);\ +t0 = _mm256_unpacklo_epi64(m4, m3);\ +t1 = _mm256_unpacklo_epi64(m5, m0);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -128,15 +128,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_4_3(b0) \ do { \ -t0 = _mm256_blend_epi32(m5, m7, 0x33);\ -t1 = _mm256_blend_epi32(m1, m3, 0x33);\ +t0 = _mm256_alignr_epi8(m7, m1, 8);\ +t1 = _mm256_alignr_epi8(m3, m5, 8);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_4_4(b0) \ do { \ -t0 = _mm256_alignr_epi8(m6, m0, 8);\ -t1 = _mm256_blend_epi32(m6, m4, 0x33);\ +t0 = _mm256_unpackhi_epi64(m6, m0);\ +t1 = _mm256_unpacklo_epi64(m6, m4);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -156,15 +156,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_5_3(b0) \ do { \ -t0 = _mm256_blend_epi32(m3, m2, 0x33);\ -t1 = _mm256_unpackhi_epi64(m7, m0);\ +t0 = _mm256_alignr_epi8(m2, m0, 8);\ +t1 = _mm256_unpackhi_epi64(m3, m7);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_5_4(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m6, m2);\ -t1 = _mm256_blend_epi32(m4, m7, 0x33);\ +t0 = _mm256_unpackhi_epi64(m4, m6);\ +t1 = _mm256_alignr_epi8(m7, m2, 8);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -184,15 +184,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_6_3(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m0, m3);\ -t1 = _mm256_shuffle_epi32(m4, _MM_SHUFFLE(1,0,3,2));\ +t0 = _mm256_unpacklo_epi64(m4, m0);\ +t1 = _mm256_blend_epi32(m4, m3, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_6_4(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m3, m1);\ -t1 = _mm256_blend_epi32(m5, m1, 0x33);\ +t0 = _mm256_unpackhi_epi64(m5, m3);\ +t1 = _mm256_shuffle_epi32(m1, _MM_SHUFFLE(1,0,3,2));\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -212,15 +212,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_7_3(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m2, m7);\ -t1 = _mm256_unpacklo_epi64(m4, m1);\ +t0 = _mm256_blend_epi32(m2, m1, 0x33);\ +t1 = _mm256_alignr_epi8(m4, m7, 8);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_7_4(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m0, m2);\ -t1 = _mm256_unpacklo_epi64(m3, m5);\ +t0 = _mm256_unpacklo_epi64(m5, m0);\ +t1 = _mm256_unpacklo_epi64(m2, m3);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -240,15 +240,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_8_3(b0) \ do { \ -t0 = m6;\ -t1 = _mm256_alignr_epi8(m5, m0, 8);\ +t0 = _mm256_unpacklo_epi64(m5, m6);\ +t1 = _mm256_unpackhi_epi64(m6, m0);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_8_4(b0) \ do { \ -t0 = _mm256_blend_epi32(m3, m1, 0x33);\ -t1 = m2;\ +t0 = _mm256_alignr_epi8(m1, m2, 8);\ +t1 = _mm256_alignr_epi8(m2, m3, 8);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -268,15 +268,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_9_3(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m7, m4);\ -t1 = _mm256_unpackhi_epi64(m1, m6);\ +t0 = _mm256_unpackhi_epi64(m6, m7);\ +t1 = _mm256_unpackhi_epi64(m4, m1);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_9_4(b0) \ do { \ -t0 = _mm256_alignr_epi8(m7, m5, 8);\ -t1 = _mm256_unpacklo_epi64(m6, m0);\ +t0 = _mm256_blend_epi32(m5, m0, 0x33);\ +t1 = _mm256_unpacklo_epi64(m7, m6);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -296,15 +296,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_10_3(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m4, m5);\ -t1 = _mm256_unpacklo_epi64(m6, m7);\ +t0 = _mm256_unpacklo_epi64(m7, m4);\ +t1 = _mm256_unpacklo_epi64(m5, m6);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_10_4(b0) \ do { \ -t0 = _mm256_unpackhi_epi64(m4, m5);\ -t1 = _mm256_unpackhi_epi64(m6, m7);\ +t0 = _mm256_unpackhi_epi64(m7, m4);\ +t1 = _mm256_unpackhi_epi64(m5, m6);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) @@ -324,15 +324,15 @@ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ #define BLAKE2B_LOAD_MSG_11_3(b0) \ do { \ -t0 = _mm256_shuffle_epi32(m0, _MM_SHUFFLE(1,0,3,2));\ -t1 = _mm256_unpackhi_epi64(m5, m2);\ +t0 = _mm256_unpackhi_epi64(m2, m0);\ +t1 = _mm256_blend_epi32(m5, m0, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0) #define BLAKE2B_LOAD_MSG_11_4(b0) \ do { \ -t0 = _mm256_unpacklo_epi64(m6, m1);\ -t1 = _mm256_unpackhi_epi64(m3, m1);\ +t0 = _mm256_alignr_epi8(m6, m1, 8);\ +t1 = _mm256_blend_epi32(m3, m1, 0x33);\ b0 = _mm256_blend_epi32(t0, t1, 0xF0);\ } while(0)