This commit is contained in:
techknowlogick 2021-02-28 18:08:33 -05:00 committed by GitHub
parent 030646eea4
commit 47f6a4ec3f
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
947 changed files with 26119 additions and 7062 deletions

View file

@ -116,6 +116,8 @@ BenchmarkParallel/8MB-4 2182.48 17252.88 7.91x
These measurements were performed on AWS EC2 instance of type `c5.xlarge` equipped with a Xeon Platinum 8124M CPU at 3.0 GHz.
If only one or two inputs are available, the scalar calculation method is used instead,
since it is the fastest option in these cases.
## Operation

View file

@ -1,132 +0,0 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE.Golang file.
// Code generated by go run gen.go -output md5block.go; DO NOT EDIT.
package md5simd
import (
"encoding/binary"
"math/bits"
)
// digest is the hashing state that blockGeneric reads from and writes back to.
type digest struct {
// s holds the four 32-bit state words; blockGeneric loads them as a, b, c, d
// and stores the updated values back when it is done.
s [4]uint32
// NOTE(review): x, nx and len are not referenced by the visible code.
// Presumably x buffers a partial block, nx counts the bytes held in x and
// len is the total number of bytes hashed so far — confirm against the callers.
x [BlockSize]byte
nx int
len uint64
}
// blockGeneric is the pure Go MD5 block function. It consumes p in chunks of
// BlockSize bytes and folds every complete chunk into the state words dig.s.
//
// A trailing chunk shorter than BlockSize is ignored: the loop only runs while
// a full BlockSize bytes remain. If p holds no complete chunk, dig.s is written
// back unchanged.
//
// NOTE(review): the body reads sixteen 32-bit words (64 bytes) per chunk, so
// BlockSize is presumably 64 — confirm where the constant is declared.
func blockGeneric(dig *digest, p []byte) {
// load state
a, b, c, d := dig.s[0], dig.s[1], dig.s[2], dig.s[3]
for i := 0; i <= len(p)-BlockSize; i += BlockSize {
// eliminate bounds checks on p
// (the three-index slice fixes both length and capacity of q to BlockSize)
q := p[i:]
q = q[:BlockSize:BlockSize]
// save current state
aa, bb, cc, dd := a, b, c, d
// load input block
// (sixteen little-endian 32-bit message words, x0..xf)
x0 := binary.LittleEndian.Uint32(q[4*0x0:])
x1 := binary.LittleEndian.Uint32(q[4*0x1:])
x2 := binary.LittleEndian.Uint32(q[4*0x2:])
x3 := binary.LittleEndian.Uint32(q[4*0x3:])
x4 := binary.LittleEndian.Uint32(q[4*0x4:])
x5 := binary.LittleEndian.Uint32(q[4*0x5:])
x6 := binary.LittleEndian.Uint32(q[4*0x6:])
x7 := binary.LittleEndian.Uint32(q[4*0x7:])
x8 := binary.LittleEndian.Uint32(q[4*0x8:])
x9 := binary.LittleEndian.Uint32(q[4*0x9:])
xa := binary.LittleEndian.Uint32(q[4*0xa:])
xb := binary.LittleEndian.Uint32(q[4*0xb:])
xc := binary.LittleEndian.Uint32(q[4*0xc:])
xd := binary.LittleEndian.Uint32(q[4*0xd:])
xe := binary.LittleEndian.Uint32(q[4*0xe:])
xf := binary.LittleEndian.Uint32(q[4*0xf:])
// round 1
// mixing function ((c^d)&b)^d: takes the bit of c where b is set, otherwise the bit of d
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x0+0xd76aa478, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x1+0xe8c7b756, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x2+0x242070db, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x3+0xc1bdceee, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x4+0xf57c0faf, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x5+0x4787c62a, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+x6+0xa8304613, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+x7+0xfd469501, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+x8+0x698098d8, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+x9+0x8b44f7af, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xa+0xffff5bb1, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xb+0x895cd7be, 22)
a = b + bits.RotateLeft32((((c^d)&b)^d)+a+xc+0x6b901122, 7)
d = a + bits.RotateLeft32((((b^c)&a)^c)+d+xd+0xfd987193, 12)
c = d + bits.RotateLeft32((((a^b)&d)^b)+c+xe+0xa679438e, 17)
b = c + bits.RotateLeft32((((d^a)&c)^a)+b+xf+0x49b40821, 22)
// round 2
// mixing function ((b^c)&d)^c: takes the bit of b where d is set, otherwise the bit of c
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x1+0xf61e2562, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x6+0xc040b340, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xb+0x265e5a51, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x0+0xe9b6c7aa, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x5+0xd62f105d, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xa+0x02441453, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+xf+0xd8a1e681, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x4+0xe7d3fbc8, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+x9+0x21e1cde6, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+xe+0xc33707d6, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x3+0xf4d50d87, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+x8+0x455a14ed, 20)
a = b + bits.RotateLeft32((((b^c)&d)^c)+a+xd+0xa9e3e905, 5)
d = a + bits.RotateLeft32((((a^b)&c)^b)+d+x2+0xfcefa3f8, 9)
c = d + bits.RotateLeft32((((d^a)&b)^a)+c+x7+0x676f02d9, 14)
b = c + bits.RotateLeft32((((c^d)&a)^d)+b+xc+0x8d2a4c8a, 20)
// round 3
// mixing function b^c^d: bitwise XOR of the three other state words
a = b + bits.RotateLeft32((b^c^d)+a+x5+0xfffa3942, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x8+0x8771f681, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xb+0x6d9d6122, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xe+0xfde5380c, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x1+0xa4beea44, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x4+0x4bdecfa9, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x7+0xf6bb4b60, 16)
b = c + bits.RotateLeft32((c^d^a)+b+xa+0xbebfbc70, 23)
a = b + bits.RotateLeft32((b^c^d)+a+xd+0x289b7ec6, 4)
d = a + bits.RotateLeft32((a^b^c)+d+x0+0xeaa127fa, 11)
c = d + bits.RotateLeft32((d^a^b)+c+x3+0xd4ef3085, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x6+0x04881d05, 23)
a = b + bits.RotateLeft32((b^c^d)+a+x9+0xd9d4d039, 4)
d = a + bits.RotateLeft32((a^b^c)+d+xc+0xe6db99e5, 11)
c = d + bits.RotateLeft32((d^a^b)+c+xf+0x1fa27cf8, 16)
b = c + bits.RotateLeft32((c^d^a)+b+x2+0xc4ac5665, 23)
// round 4
// mixing function c^(b|^d): c XORed with (b OR NOT d)
a = b + bits.RotateLeft32((c^(b|^d))+a+x0+0xf4292244, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x7+0x432aff97, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xe+0xab9423a7, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x5+0xfc93a039, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+xc+0x655b59c3, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+x3+0x8f0ccc92, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+xa+0xffeff47d, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x1+0x85845dd1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x8+0x6fa87e4f, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xf+0xfe2ce6e0, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x6+0xa3014314, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+xd+0x4e0811a1, 21)
a = b + bits.RotateLeft32((c^(b|^d))+a+x4+0xf7537e82, 6)
d = a + bits.RotateLeft32((b^(a|^c))+d+xb+0xbd3af235, 10)
c = d + bits.RotateLeft32((a^(d|^b))+c+x2+0x2ad7d2bb, 15)
b = c + bits.RotateLeft32((d^(c|^a))+b+x9+0xeb86d391, 21)
// add saved state
// (the state from before this chunk is added back into the mixed words)
a += aa
b += bb
c += cc
d += dd
}
// save state
dig.s[0], dig.s[1], dig.s[2], dig.s[3] = a, b, c, d
}

View file

@ -2,70 +2,72 @@
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
//+build !noasm,!appengine,gc
// This is the AVX512 implementation of the MD5 block function (16-way parallel)
#define prep(index) \
KMOVQ kmask, ktmp \
KMOVQ kmask, ktmp \
VPGATHERDD index*4(base)(ptrs*1), ktmp, mem
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
prep(index) \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND1noload(a, b, c, d, const, shift) \
VXORPS c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
VPXORQ c, tmp, tmp \
VPADDD 64*const(consts), a, a \
VPADDD mem, a, a \
VPTERNLOGD $0x6C, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD c, tmp \
VPADDD b, a, a
#define ROUND2(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPS c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VANDNPD c, tmp, tmp \
VPTERNLOGD $0xEC, b, tmp, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
VPROLD $shift, a, a \
VPADDD b, a, a
#define ROUND3(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x96, b, d, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VMOVAPD b, tmp \
VPADDD b, a, a
#define ROUND4(a, b, c, d, zreg, const, shift) \
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VXORPS c, ones, tmp \
VPADDD b, a, a
VPADDD 64*const(consts), a, a \
VPADDD zreg, a, a \
VPTERNLOGD $0x36, b, c, tmp \
VPADDD tmp, a, a \
VPROLD $shift, a, a \
VPXORQ c, ones, tmp \
VPADDD b, a, a
TEXT ·block16(SB),4,$0-40
TEXT ·block16(SB), 4, $0-40
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
MOVQ state+0(FP), BX
MOVQ base+8(FP), SI
MOVQ ptrs+16(FP), AX
KMOVQ mask+24(FP), K1
MOVQ n+32(FP), DX
MOVQ ·avx512md5consts+0(SB), DI
#define a Z0
#define b Z1
@ -90,7 +92,6 @@ TEXT ·block16(SB),4,$0-40
// Registers Z16 through to Z31 are used for caching purposes
// ----------------------------------------------------------
#define dig BX
#define count DX
#define base SI
@ -105,7 +106,7 @@ TEXT ·block16(SB),4,$0-40
// load source pointers
VMOVUPD 0x00(AX), ptrs
MOVQ $-1, AX
MOVQ $-1, AX
VPBROADCASTQ AX, ones
loop:
@ -190,7 +191,7 @@ loop:
ROUND3(c,d,a,b, Z31,0x2e,16)
ROUND3(b,c,d,a, Z18,0x2f,23)
VXORPS d, ones, tmp
VPXORQ d, ones, tmp
ROUND4(a,b,c,d, Z16,0x30, 6)
ROUND4(d,a,b,c, Z23,0x31,10)

View file

@ -1,3 +1,5 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2018 Igneous Systems
// MIT License
//
@ -70,7 +72,7 @@ TEXT ·block8(SB), 4, $0-40
#define consts DI
#define prepmask \
VXORPS mask, mask, mask \
VPXOR mask, mask, mask \
VPCMPGTD mask, off, mask
#define prep(index) \
@ -86,14 +88,14 @@ TEXT ·block8(SB), 4, $0-40
#define roll(shift, a) \
VPSLLD $shift, a, rtmp1 \
VPSRLD $32-shift, a, a \
VORPS rtmp1, a, a
VPOR rtmp1, a, a
#define ROUND1(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
prep(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@ -101,11 +103,11 @@ TEXT ·block8(SB), 4, $0-40
VPADDD b, a, a
#define ROUND1load(a, b, c, d, index, const, shift) \
VXORPS c, tmp, tmp \
VXORPD c, tmp, tmp \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp, tmp \
VXORPS d, tmp, tmp \
VPAND b, tmp, tmp \
VPXOR d, tmp, tmp \
load(index) \
VPADDD tmp, a, a \
roll(shift,a) \
@ -115,10 +117,10 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND2(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VANDPS b, tmp2, tmp2 \
VANDNPS c, tmp, tmp \
VPAND b, tmp2, tmp2 \
VANDNPD c, tmp, tmp \
load(index) \
VORPS tmp, tmp2, tmp2 \
VPOR tmp, tmp2, tmp2 \
VMOVAPD c, tmp \
VPADDD tmp2, a, a \
VMOVAPD c, tmp2 \
@ -129,8 +131,8 @@ TEXT ·block8(SB), 4, $0-40
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
load(index) \
VXORPS d, tmp, tmp \
VXORPS b, tmp, tmp \
VPXOR d, tmp, tmp \
VPXOR b, tmp, tmp \
VPADDD tmp, a, a \
roll(shift,a) \
VMOVAPD b, tmp \
@ -139,12 +141,12 @@ TEXT ·block8(SB), 4, $0-40
#define ROUND4(a, b, c, d, index, const, shift) \
VPADDD 32*const(consts), a, a \
VPADDD mem, a, a \
VORPS b, tmp, tmp \
VXORPS c, tmp, tmp \
VPOR b, tmp, tmp \
VPXOR c, tmp, tmp \
VPADDD tmp, a, a \
load(index) \
roll(shift,a) \
VXORPS c, ones, tmp \
VPXOR c, ones, tmp \
VPADDD b, a, a
// load digest into state registers
@ -242,7 +244,7 @@ loop:
ROUND3(b,c,d,a, 0,0x2f,23)
load(0)
VXORPS d, ones, tmp
VPXOR d, ones, tmp
ROUND4(a,b,c,d, 7,0x30, 6)
ROUND4(d,a,b,c,14,0x31,10)

View file

@ -9,14 +9,18 @@ package md5simd
import (
"fmt"
"math"
"sync"
"unsafe"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
var hasAVX512 bool
func init() {
	// The AVX512 code path uses VANDNPD, which needs AVX512DQ on top of AVX512F.
	// (It could be replaced by VPTERNLOGQ, which only needs AVX512F.)
	hasF := cpuid.CPU.Supports(cpuid.AVX512F)
	hasDQ := cpuid.CPU.Supports(cpuid.AVX512DQ)
	hasAVX512 = hasF && hasDQ
}
//go:noescape
func block8(state *uint32, base uintptr, bufs *int32, cache *byte, n int)
@ -82,45 +86,52 @@ var avx512md5consts = func(c []uint32) []uint32 {
return inf
}(md5consts[:])
func init() {
hasAVX512 = cpuid.CPU.AVX512F()
}
// Interface function to assembly code
func (s *md5Server) blockMd5_x16(d *digest16, input [16][]byte, half bool) {
if hasAVX512 {
blockMd5_avx512(d, input, s.allBufs, &s.maskRounds16)
} else {
d8a, d8b := digest8{}, digest8{}
for i := range d8a.v0 {
j := i + 8
d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
if !half {
d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
}
return
}
i8 := [2][8][]byte{}
for i := range i8[0] {
i8[0][i], i8[1][i] = input[i], input[8+i]
}
if half {
blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a)
} else {
wg := sync.WaitGroup{}
wg.Add(2)
go func() { blockMd5_avx2(&d8a, i8[0], s.allBufs, &s.maskRounds8a); wg.Done() }()
go func() { blockMd5_avx2(&d8b, i8[1], s.allBufs, &s.maskRounds8b); wg.Done() }()
wg.Wait()
}
// Preparing data using copy is slower since copies aren't inlined.
for i := range d8a.v0 {
j := i + 8
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = d8a.v0[i], d8a.v1[i], d8a.v2[i], d8a.v3[i]
if !half {
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = d8b.v0[i], d8b.v1[i], d8b.v2[i], d8b.v3[i]
}
// Calculate on this goroutine
if half {
for i := range s.i8[0][:] {
s.i8[0][i] = input[i]
}
for i := range s.d8a.v0[:] {
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
}
blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a)
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
return
}
for i := range s.i8[0][:] {
s.i8[0][i], s.i8[1][i] = input[i], input[8+i]
}
for i := range s.d8a.v0[:] {
j := (i + 8) & 15
s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i] = d.v0[i], d.v1[i], d.v2[i], d.v3[i]
s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i] = d.v0[j], d.v1[j], d.v2[j], d.v3[j]
}
// Benchmarks appear to be slightly faster when spinning up 2 goroutines instead
// of using the current goroutine for one of the blocks.
s.wg.Add(2)
go func() { blockMd5_avx2(&s.d8a, s.i8[0], s.allBufs, &s.maskRounds8a); s.wg.Done() }()
go func() { blockMd5_avx2(&s.d8b, s.i8[1], s.allBufs, &s.maskRounds8b); s.wg.Done() }()
s.wg.Wait()
for i := range s.d8a.v0[:] {
d.v0[i], d.v1[i], d.v2[i], d.v3[i] = s.d8a.v0[i], s.d8a.v1[i], s.d8a.v2[i], s.d8a.v3[i]
}
for i := range s.d8b.v0[:] {
j := (i + 8) & 15
d.v0[j], d.v1[j], d.v2[j], d.v3[j] = s.d8b.v0[i], s.d8b.v1[i], s.d8b.v2[i], s.d8b.v3[i]
}
}

View file

@ -2,6 +2,4 @@ module github.com/minio/md5-simd
go 1.14
require (
github.com/klauspost/cpuid v1.2.3
)
require github.com/klauspost/cpuid/v2 v2.0.1

View file

@ -1,2 +1,2 @@
github.com/klauspost/cpuid v1.2.3 h1:CCtW0xUnWGVINKvE/WWOYKdsPV6mawAtvQuSl8guwQs=
github.com/klauspost/cpuid v1.2.3/go.mod h1:Pj4uuM528wm8OyEC2QMXAi2YiTZ96dNQPGgoMS4s3ek=
github.com/klauspost/cpuid/v2 v2.0.1 h1:lb04bBEJoAoV48eHs4Eq0UyhmJCkRSdIjQ3uS8WJRM4=
github.com/klauspost/cpuid/v2 v2.0.1/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=

View file

@ -10,6 +10,7 @@ import (
"encoding/binary"
"errors"
"fmt"
"sync"
"sync/atomic"
)
@ -121,6 +122,14 @@ func (d *md5Digest) Close() {
}
}
var sumChPool sync.Pool
func init() {
sumChPool.New = func() interface{} {
return make(chan sumResult, 1)
}
}
// Sum - Return MD5 sum in bytes
func (d *md5Digest) Sum(in []byte) (result []byte) {
if d.blocksCh == nil {
@ -148,10 +157,11 @@ func (d *md5Digest) Sum(in []byte) (result []byte) {
if len(trail)%BlockSize != 0 {
panic(fmt.Errorf("internal error: sum block was not aligned. len=%d, nx=%d", len(trail), d.nx))
}
sumCh := make(chan sumResult, 1)
sumCh := sumChPool.Get().(chan sumResult)
d.sendBlock(blockInput{uid: d.uid, msg: trail, sumCh: sumCh}, true)
sum := <-sumCh
sumChPool.Put(sumCh)
return append(in, sum.digest[:]...)
}

View file

@ -10,8 +10,9 @@ import (
"encoding/binary"
"fmt"
"runtime"
"sync"
"github.com/klauspost/cpuid"
"github.com/klauspost/cpuid/v2"
)
// MD5 initialization constants
@ -23,6 +24,9 @@ const (
init1 = 0xefcdab89
init2 = 0x98badcfe
init3 = 0x10325476
// Use scalar routine when below this many lanes
useScalarBelow = 3
)
// md5ServerUID - Does not start at 0 but next multiple of 16 so as to be able to
@ -56,11 +60,15 @@ type md5Server struct {
maskRounds8b [8]maskRounds // Pre-allocated static array for max 8 rounds (2nd AVX2 core)
allBufs []byte // Preallocated buffer.
buffers chan []byte // Preallocated buffers, sliced from allBufs.
i8 [2][8][]byte // avx2 temporary vars
d8a, d8b digest8
wg sync.WaitGroup
}
// NewServer - Create new object for parallel processing handling
func NewServer() Server {
if !cpuid.CPU.AVX2() {
if !cpuid.CPU.Supports(cpuid.AVX2) {
return &fallbackServer{}
}
md5srv := &md5Server{}
@ -152,7 +160,7 @@ func (s *md5Server) process(newClients chan newClient) {
sum := sumResult{}
// Add end block to current digest.
blockGeneric(&dig, block.msg)
blockScalar(&dig.s, block.msg)
binary.LittleEndian.PutUint32(sum.digest[0:], dig.s[0])
binary.LittleEndian.PutUint32(sum.digest[4:], dig.s[1])
@ -262,6 +270,88 @@ func (s *md5Server) Close() {
// Invoke assembly and send results back
func (s *md5Server) blocks(lanes []blockInput) {
if len(lanes) < useScalarBelow {
// Use scalar routine when below this many lanes
switch len(lanes) {
case 0:
case 1:
lane := lanes[0]
var d digest
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) > 0 {
// Update...
blockScalar(&d.s, lane.msg)
}
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], d.s[0])
binary.LittleEndian.PutUint32(dig[4:], d.s[1])
binary.LittleEndian.PutUint32(dig[8:], d.s[2])
binary.LittleEndian.PutUint32(dig[12:], d.s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[0] = blockInput{}
default:
s.wg.Add(len(lanes))
var results [useScalarBelow]digest
for i := range lanes {
lane := lanes[i]
go func(i int) {
var d digest
defer s.wg.Done()
a, ok := s.digests[lane.uid]
if ok {
d.s[0] = binary.LittleEndian.Uint32(a[0:4])
d.s[1] = binary.LittleEndian.Uint32(a[4:8])
d.s[2] = binary.LittleEndian.Uint32(a[8:12])
d.s[3] = binary.LittleEndian.Uint32(a[12:16])
} else {
d.s[0] = init0
d.s[1] = init1
d.s[2] = init2
d.s[3] = init3
}
if len(lane.msg) == 0 {
results[i] = d
return
}
// Update...
blockScalar(&d.s, lane.msg)
results[i] = d
}(i)
}
s.wg.Wait()
for i, lane := range lanes {
dig := [Size]byte{}
binary.LittleEndian.PutUint32(dig[0:], results[i].s[0])
binary.LittleEndian.PutUint32(dig[4:], results[i].s[1])
binary.LittleEndian.PutUint32(dig[8:], results[i].s[2])
binary.LittleEndian.PutUint32(dig[12:], results[i].s[3])
s.digests[lane.uid] = dig
if lane.msg != nil {
s.buffers <- lane.msg
}
lanes[i] = blockInput{}
}
}
return
}
inputs := [16][]byte{}
for i := range lanes {
inputs[i] = lanes[i].msg

View file

@ -1,19 +1,21 @@
//+build !noasm,!appengine,gc
// Copyright (c) 2020 MinIO Inc. All rights reserved.
// Use of this source code is governed by a license that can be
// found in the LICENSE file.
package md5simd
import (
"sort"
)
// lane is a helper struct for sorting input blocks by length while
// remembering which input slot each one came from.
type lane struct {
len uint // length in bytes of the input in this slot
pos uint // index of the input within the input array
}
// digest holds the four 32-bit MD5 state words for a single (scalar) hash.
type digest struct {
s [4]uint32
}
// Helper struct for generating number of rounds in combination with mask for valid lanes
type maskRounds struct {
mask uint64
@ -23,15 +25,22 @@ type maskRounds struct {
func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [8]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}
@ -45,18 +54,24 @@ func generateMaskAndRounds8(input [8][]byte, mr *[8]maskRounds) (rounds int) {
}
func generateMaskAndRounds16(input [16][]byte, mr *[16]maskRounds) (rounds int) {
// Sort on blocks length small to large
var sorted [16]lane
for c, inpt := range input {
for c, inpt := range input[:] {
sorted[c] = lane{uint(len(inpt)), uint(c)}
for i := c - 1; i >= 0; i-- {
// swap so largest is at the end...
if sorted[i].len > sorted[i+1].len {
sorted[i], sorted[i+1] = sorted[i+1], sorted[i]
continue
}
break
}
}
sort.Slice(sorted[:], func(i, j int) bool { return sorted[i].len < sorted[j].len })
// Create mask array including 'rounds' (of processing blocks of 64 bytes) between masks
m, round := uint64(0xffff), uint64(0)
for _, s := range sorted {
for _, s := range sorted[:] {
if s.len > 0 {
if uint64(s.len)>>6 > round {
mr[rounds] = maskRounds{m, (uint64(s.len) >> 6) - round}

View file

@ -27,6 +27,12 @@ type Hasher interface {
Close()
}
// StdlibHasher returns a Hasher that uses the stdlib for hashing.
// Used hashers are stored in a pool for fast reuse.
//
// The hasher is taken from md5Pool with Get, so an instance that was returned
// to the pool earlier is reused; Get falls back to the pool's New function when
// the pool is empty. Calling md5Pool.New directly would always allocate a
// fresh hasher and never take one back out of the pool.
func StdlibHasher() Hasher {
	h := md5Pool.Get().(hash.Hash)
	// Defensive: make sure a recycled hasher starts from a clean state no
	// matter how it was put back.
	h.Reset()
	return &md5Wrapper{Hash: h}
}
// md5Wrapper is a wrapper around the builtin hasher.
type md5Wrapper struct {
hash.Hash

11
vendor/github.com/minio/md5-simd/md5block_amd64.go generated vendored Normal file
View file

@ -0,0 +1,11 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
package md5simd
// blockScalar is the assembly implementation of the scalar MD5 block function:
// it hashes p into the four state words pointed to by dig, updating them in place.
// NOTE(review): the assembly in md5block_amd64.s rounds len(p) down to a multiple
// of 64 before processing, so trailing bytes look like they are ignored — confirm
// that callers only pass whole blocks.
//
//go:noescape
func blockScalar(dig *[4]uint32, p []byte)

714
vendor/github.com/minio/md5-simd/md5block_amd64.s generated vendored Normal file
View file

@ -0,0 +1,714 @@
// Code generated by command: go run gen.go -out ../md5block_amd64.s -stubs ../md5block_amd64.go -pkg=md5simd. DO NOT EDIT.
// +build !appengine
// +build !noasm
// +build gc
// func blockScalar(dig *[4]uint32, p []byte)
//
// Scalar MD5 block function. Processes every complete 64-byte block of p and
// updates the four 32-bit state words at dig in place. The length is rounded
// down to a multiple of 64 (SHRQ/SHLQ by 6); if no complete block remains the
// routine returns without touching dig.
//
// Register use:
//   AX           end pointer (p_base + rounded length); reused for dig at the end
//   DX           pointer to the current 64-byte block
//   BX,BP,SI,CX  state words dig[0], dig[1], dig[2], dig[3]
//   DI           constant 0xffffffff, XORed with a word to form its bitwise NOT
//   R8           message word preloaded for the next step
//   R9, R14      scratch for the per-round mixing function
//   R10-R13      copy of the state at the start of the block (added back at the end)
//
// NOTE(review): BP is used as a general-purpose register while the frame size
// is $0 — confirm that clobbering BP here is acceptable for frame-pointer based
// unwinding/profiling (see the Go assembler guide on BP).
TEXT ·blockScalar(SB), $0-32
MOVQ p_len+16(FP), AX
MOVQ dig+0(FP), CX
MOVQ p_base+8(FP), DX
// Round the length down to whole 64-byte blocks and turn it into an end pointer.
SHRQ $0x06, AX
SHLQ $0x06, AX
LEAQ (DX)(AX*1), AX
CMPQ DX, AX
JEQ end
// Load the four state words; CX is overwritten last because it holds dig.
MOVL (CX), BX
MOVL 4(CX), BP
MOVL 8(CX), SI
MOVL 12(CX), CX
MOVL $0xffffffff, DI
loop:
MOVL (DX), R8
MOVL CX, R9
// Save the state at the start of this block.
MOVL BX, R10
MOVL BP, R11
MOVL SI, R12
MOVL CX, R13
// ROUND1
// Mixing function built in R9: ((c ^ d) & b) ^ d.
XORL SI, R9
ADDL $0xd76aa478, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 4(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xe8c7b756, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 8(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0x242070db, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 12(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xc1bdceee, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 16(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0xf57c0faf, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 20(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x4787c62a, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 24(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa8304613, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 28(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0xfd469501, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 32(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x698098d8, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 36(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0x8b44f7af, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 40(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xffff5bb1, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 44(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x895cd7be, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 48(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
XORL SI, R9
ADDL $0x6b901122, BX
ADDL R8, BX
ANDL BP, R9
XORL CX, R9
MOVL 52(DX), R8
ADDL R9, BX
ROLL $0x07, BX
MOVL SI, R9
ADDL BP, BX
XORL BP, R9
ADDL $0xfd987193, CX
ADDL R8, CX
ANDL BX, R9
XORL SI, R9
MOVL 56(DX), R8
ADDL R9, CX
ROLL $0x0c, CX
MOVL BP, R9
ADDL BX, CX
XORL BX, R9
ADDL $0xa679438e, SI
ADDL R8, SI
ANDL CX, R9
XORL BP, R9
MOVL 60(DX), R8
ADDL R9, SI
ROLL $0x11, SI
MOVL BX, R9
ADDL CX, SI
XORL CX, R9
ADDL $0x49b40821, BP
ADDL R8, BP
ANDL SI, R9
XORL BX, R9
MOVL 4(DX), R8
ADDL R9, BP
ROLL $0x16, BP
MOVL CX, R9
ADDL SI, BP
// ROUND2
// Mixing function built in R14: (b & d) | (c & ^d); ^d is formed as d XOR DI.
MOVL CX, R9
MOVL CX, R14
XORL DI, R9
ADDL $0xf61e2562, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 24(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc040b340, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 44(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x265e5a51, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL (DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe9b6c7aa, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xd62f105d, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 40(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0x02441453, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 60(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xd8a1e681, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 16(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0xe7d3fbc8, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 36(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0x21e1cde6, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 56(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xc33707d6, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 12(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0xf4d50d87, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 32(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x455a14ed, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 52(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
XORL DI, R9
ADDL $0xa9e3e905, BX
ADDL R8, BX
ANDL BP, R14
ANDL SI, R9
MOVL 8(DX), R8
ORL R9, R14
MOVL SI, R9
ADDL R14, BX
MOVL SI, R14
ROLL $0x05, BX
ADDL BP, BX
XORL DI, R9
ADDL $0xfcefa3f8, CX
ADDL R8, CX
ANDL BX, R14
ANDL BP, R9
MOVL 28(DX), R8
ORL R9, R14
MOVL BP, R9
ADDL R14, CX
MOVL BP, R14
ROLL $0x09, CX
ADDL BX, CX
XORL DI, R9
ADDL $0x676f02d9, SI
ADDL R8, SI
ANDL CX, R14
ANDL BX, R9
MOVL 48(DX), R8
ORL R9, R14
MOVL BX, R9
ADDL R14, SI
MOVL BX, R14
ROLL $0x0e, SI
ADDL CX, SI
XORL DI, R9
ADDL $0x8d2a4c8a, BP
ADDL R8, BP
ANDL SI, R14
ANDL CX, R9
MOVL 20(DX), R8
ORL R9, R14
MOVL CX, R9
ADDL R14, BP
MOVL CX, R14
ROLL $0x14, BP
ADDL SI, BP
// ROUND3
// Mixing function built in R9: b ^ c ^ d.
MOVL SI, R9
ADDL $0xfffa3942, BX
ADDL R8, BX
MOVL 32(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x8771f681, CX
ADDL R8, CX
MOVL 44(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x6d9d6122, SI
ADDL R8, SI
MOVL 56(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xfde5380c, BP
ADDL R8, BP
MOVL 4(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xa4beea44, BX
ADDL R8, BX
MOVL 16(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0x4bdecfa9, CX
ADDL R8, CX
MOVL 28(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xf6bb4b60, SI
ADDL R8, SI
MOVL 40(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xbebfbc70, BP
ADDL R8, BP
MOVL 52(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0x289b7ec6, BX
ADDL R8, BX
MOVL (DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xeaa127fa, CX
ADDL R8, CX
MOVL 12(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0xd4ef3085, SI
ADDL R8, SI
MOVL 24(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0x04881d05, BP
ADDL R8, BP
MOVL 36(DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
ADDL $0xd9d4d039, BX
ADDL R8, BX
MOVL 48(DX), R8
XORL CX, R9
XORL BP, R9
ADDL R9, BX
ROLL $0x04, BX
MOVL BP, R9
ADDL BP, BX
ADDL $0xe6db99e5, CX
ADDL R8, CX
MOVL 60(DX), R8
XORL SI, R9
XORL BX, R9
ADDL R9, CX
ROLL $0x0b, CX
MOVL BX, R9
ADDL BX, CX
ADDL $0x1fa27cf8, SI
ADDL R8, SI
MOVL 8(DX), R8
XORL BP, R9
XORL CX, R9
ADDL R9, SI
ROLL $0x10, SI
MOVL CX, R9
ADDL CX, SI
ADDL $0xc4ac5665, BP
ADDL R8, BP
MOVL (DX), R8
XORL BX, R9
XORL SI, R9
ADDL R9, BP
ROLL $0x17, BP
MOVL SI, R9
ADDL SI, BP
// ROUND4
// Mixing function built in R9: c ^ (b | ^d); ^d is formed as DI XOR d.
MOVL DI, R9
XORL CX, R9
ADDL $0xf4292244, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 28(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x432aff97, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 56(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xab9423a7, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 20(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xfc93a039, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 48(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x655b59c3, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 12(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0x8f0ccc92, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 40(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xffeff47d, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 4(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x85845dd1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 32(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0x6fa87e4f, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 60(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xfe2ce6e0, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 24(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0xa3014314, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 52(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0x4e0811a1, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
MOVL 16(DX), R8
MOVL DI, R9
ROLL $0x15, BP
XORL CX, R9
ADDL SI, BP
ADDL $0xf7537e82, BX
ADDL R8, BX
ORL BP, R9
XORL SI, R9
ADDL R9, BX
MOVL 44(DX), R8
MOVL DI, R9
ROLL $0x06, BX
XORL SI, R9
ADDL BP, BX
ADDL $0xbd3af235, CX
ADDL R8, CX
ORL BX, R9
XORL BP, R9
ADDL R9, CX
MOVL 8(DX), R8
MOVL DI, R9
ROLL $0x0a, CX
XORL BP, R9
ADDL BX, CX
ADDL $0x2ad7d2bb, SI
ADDL R8, SI
ORL CX, R9
XORL BX, R9
ADDL R9, SI
MOVL 36(DX), R8
MOVL DI, R9
ROLL $0x0f, SI
XORL BX, R9
ADDL CX, SI
ADDL $0xeb86d391, BP
ADDL R8, BP
ORL SI, R9
XORL CX, R9
ADDL R9, BP
ROLL $0x15, BP
ADDL SI, BP
// Add the state saved at the start of this block back in.
ADDL R10, BX
ADDL R11, BP
ADDL R12, SI
ADDL R13, CX
// Prepare next loop
// (advance by one 64-byte block and continue while below the end pointer)
ADDQ $0x40, DX
CMPQ DX, AX
JB loop
// Write output
// (dig is reloaded from the argument frame because CX was reused for state)
MOVQ dig+0(FP), AX
MOVL BX, (AX)
MOVL BP, 4(AX)
MOVL SI, 8(AX)
MOVL CX, 12(AX)
end:
RET

View file

@ -47,9 +47,12 @@ func (c Client) GetObject(ctx context.Context, bucketName, objectName string, op
}
}
var httpReader io.ReadCloser
var objectInfo ObjectInfo
var err error
var (
err error
httpReader io.ReadCloser
objectInfo ObjectInfo
totalRead int
)
// Create request channel.
reqCh := make(chan getRequest)
@ -103,6 +106,7 @@ func (c Client) GetObject(ctx context.Context, bucketName, objectName string, op
// Read at least firstReq.Buffer bytes, if not we have
// reached our EOF.
size, err := readFull(httpReader, req.Buffer)
totalRead += size
if size > 0 && err == io.ErrUnexpectedEOF {
if int64(size) < objectInfo.Size {
// In situations when returned size
@ -125,7 +129,7 @@ func (c Client) GetObject(ctx context.Context, bucketName, objectName string, op
// Send back the first response.
resCh <- getResponse{
objectInfo: objectInfo,
Size: int(size),
Size: size,
Error: err,
didRead: true,
}
@ -202,19 +206,36 @@ func (c Client) GetObject(ctx context.Context, bucketName, objectName string, op
}
return
}
totalRead = 0
}
// Read at least req.Buffer bytes, if not we have
// reached our EOF.
size, err := readFull(httpReader, req.Buffer)
totalRead += size
if size > 0 && err == io.ErrUnexpectedEOF {
// If an EOF happens after reading some but not
// all the bytes ReadFull returns ErrUnexpectedEOF
err = io.EOF
if int64(totalRead) < objectInfo.Size {
// In situations when returned size
// is less than the expected content
// length set by the server, make sure
// we return io.ErrUnexpectedEOF
err = io.ErrUnexpectedEOF
} else {
// If an EOF happens after reading some but not
// all the bytes ReadFull returns ErrUnexpectedEOF
err = io.EOF
}
} else if size == 0 && err == io.EOF && objectInfo.Size > 0 {
// Special cases when server writes more data
// than the content-length, net/http response
// body returns an error, instead of converting
// it to io.EOF - return unexpected EOF.
err = io.ErrUnexpectedEOF
}
// Reply back how much was read.
resCh <- getResponse{
Size: int(size),
Size: size,
Error: err,
didRead: true,
objectInfo: objectInfo,

View file

@ -108,7 +108,7 @@ type Options struct {
// Global constants.
const (
libraryName = "minio-go"
libraryVersion = "v7.0.9"
libraryVersion = "v7.0.10"
)
// User Agent should always following the below style.

View file

@ -22,10 +22,12 @@ import "os"
// An EnvMinio retrieves credentials from the environment variables of the
// running process. Environment credentials never expire.
//
// EnvMinioironment variables used:
// Environment variables used:
//
// * Access Key ID: MINIO_ACCESS_KEY.
// * Secret Access Key: MINIO_SECRET_KEY.
// * Access Key ID: MINIO_ROOT_USER.
// * Secret Access Key: MINIO_ROOT_PASSWORD.
type EnvMinio struct {
retrieved bool
}
@ -40,12 +42,16 @@ func NewEnvMinio() *Credentials {
func (e *EnvMinio) Retrieve() (Value, error) {
e.retrieved = false
id := os.Getenv("MINIO_ACCESS_KEY")
secret := os.Getenv("MINIO_SECRET_KEY")
id := os.Getenv("MINIO_ROOT_USER")
secret := os.Getenv("MINIO_ROOT_PASSWORD")
signerType := SignatureV4
if id == "" || secret == "" {
signerType = SignatureAnonymous
id = os.Getenv("MINIO_ACCESS_KEY")
secret = os.Getenv("MINIO_SECRET_KEY")
if id == "" || secret == "" {
signerType = SignatureAnonymous
}
}
e.retrieved = true

View file

@ -1,25 +0,0 @@
sudo: required
dist: trusty
language: go
os:
- linux
go:
- tip
- 1.12.x
env:
- ARCH=x86_64
- ARCH=i686
matrix:
fast_finish: true
allow_failures:
- go: tip
script:
- diff -au <(gofmt -d .) <(printf "")
- go test -race -v ./...
- go vet -asmdecl .
- ./test-architectures.sh

View file

@ -1,14 +1,18 @@
# sha256-simd
Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions and AVX2 for Intel and ARM64 for ARM. On AVX512 it provides an up to 8x improvement (over 3 GB/s per core) in comparison to AVX2. SHA Extensions give a performance boost of close to 4x over AVX2.
Accelerate SHA256 computations in pure Go using AVX512, SHA Extensions for x86 and ARM64 for ARM.
On AVX512 it provides an up to 8x improvement (over 3 GB/s per core).
SHA Extensions give a performance boost of close to 4x over native.
## Introduction
This package is designed as a replacement for `crypto/sha256`. For Intel CPUs it has two flavors for AVX512 and AVX2 (AVX/SSE are also supported). For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions resulting in a massive performance improvement.
This package is designed as a replacement for `crypto/sha256`.
For ARM CPUs with the Cryptography Extensions, advantage is taken of the SHA2 instructions resulting in a massive performance improvement.
This package uses Golang assembly. The AVX512 version is based on the Intel's "multi-buffer crypto library for IPSec" whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
This package uses Golang assembly.
The AVX512 version is based on Intel's "multi-buffer crypto library for IPSec", whereas the other Intel implementations are described in "Fast SHA-256 Implementations on Intel Architecture Processors" by J. Guilford et al.
## New: Support for Intel SHA Extensions
## Support for Intel SHA Extensions
Support for the Intel SHA Extensions has been added by Kristofer Peterson (@svenski123), originally developed for spacemeshos [here](https://github.com/spacemeshos/POET/issues/23). On CPUs that support it (known thus far Intel Celeron J3455 and AMD Ryzen) it gives a significant boost in performance (with thanks to @AudriusButkevicius for reporting the results; full results [here](https://github.com/minio/sha256-simd/pull/37#issuecomment-451607827)).
@ -18,7 +22,9 @@ benchmark AVX2 MB/s SHA Ext MB/s speedup
BenchmarkHash5M 514.40 1975.17 3.84x
```
Thanks to Kristofer Peterson, we also added additional performance changes such as optimized padding, endian conversions which sped up all implementations i.e. Intel SHA alone while doubled performance for small sizes, the other changes increased everything roughly 50%.
Thanks to Kristofer Peterson, we also added further performance changes, such as optimized padding and
endian conversions, which sped up all implementations: the Intel SHA path alone doubled its performance for small sizes,
while the other changes made everything roughly 50% faster.
## Support for AVX512
@ -58,7 +64,8 @@ More detailed information can be found in this [blog](https://blog.minio.io/acce
## Drop-In Replacement
The following code snippet shows how you can use `github.com/minio/sha256-simd`. This will automatically select the fastest method for the architecture on which it will be executed.
The following code snippet shows how you can use `github.com/minio/sha256-simd`.
This will automatically select the fastest method for the architecture on which it will be executed.
```go
import "github.com/minio/sha256-simd"
@ -80,9 +87,6 @@ Below is the speed in MB/s for a single core (ranked fast to slow) for blocks la
| 3.0 GHz Intel Xeon Platinum 8124M | AVX512 | 3498 |
| 3.7 GHz AMD Ryzen 7 2700X | SHA Ext | 1979 |
| 1.2 GHz ARM Cortex-A53 | ARM64 | 638 |
| 3.0 GHz Intel Xeon Platinum 8124M | AVX2 | 449 |
| 3.1 GHz Intel Core i7 | AVX | 362 |
| 3.1 GHz Intel Core i7 | SSE | 299 |
## asm2plan9s

View file

@ -1,32 +0,0 @@
# version format
version: "{build}"
# Operating system (build VM template)
os: Windows Server 2012 R2
# Platform.
platform: x64
clone_folder: c:\gopath\src\github.com\minio\sha256-simd
# environment variables
environment:
GOPATH: c:\gopath
GO15VENDOREXPERIMENT: 1
# scripts that run after cloning repository
install:
- set PATH=%GOPATH%\bin;c:\go\bin;%PATH%
- go version
- go env
# to run your custom scripts instead of automatic MSBuild
build_script:
- go test .
- go test -race .
# to disable automatic tests
test: off
# to disable deployment
deploy: off

View file

@ -1,119 +0,0 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package sha256
// CPU feature flags. Each flag is true when the corresponding instruction
// set extension is available. The bool flags start out false and are
// assigned by init.
var (
	avx512 bool
	avx2   bool
	avx    bool
	sse    bool
	sse2   bool
	sse3   bool
	ssse3  bool
	sse41  bool
	sse42  bool
	popcnt bool
	sha    bool

	// armSha holds the result of haveArmSha, evaluated once at package
	// initialisation.
	armSha = haveArmSha()
)
// init detects the SIMD capabilities of the CPU and records them in the
// package-level feature flags.
//
// Detection runs in three steps:
//
//  1. cpuid(1) supplies the SSE family, POPCNT, XSAVE/OSXSAVE and AVX bits.
//  2. cpuid(7) supplies the AVX2, AVX-512 and SHA bits.
//  3. xgetbv(0) supplies the register-state bits; avx, avx2 and avx512 are
//     only set when the matching state bits are present.
//
// When XSAVE is unsupported or not enabled, init returns after step 2: the
// SSE flags, popcnt and sha keep the values read from CPUID, while avx, avx2
// and avx512 stay false.
func init() {
	// Raw CPUID bits that still have to be confirmed against the
	// register-state bits before they are published.
	var xsave, osxsave, cpuAVX, cpuAVX2 bool
	var avx512f, avx512dq, avx512bw, avx512vl bool

	// cpuid(0) returns the highest supported leaf in its first result.
	maxLeaf, _, _, _ := cpuid(0)

	if maxLeaf >= 1 {
		_, _, c, d := cpuid(1)
		sse = d&(1<<25) != 0
		sse2 = d&(1<<26) != 0
		sse3 = c&(1<<0) != 0
		ssse3 = c&(1<<9) != 0
		sse41 = c&(1<<19) != 0
		sse42 = c&(1<<20) != 0
		popcnt = c&(1<<23) != 0
		xsave = c&(1<<26) != 0
		osxsave = c&(1<<27) != 0
		cpuAVX = c&(1<<28) != 0
	}

	if maxLeaf >= 7 {
		_, b, _, _ := cpuid(7)
		cpuAVX2 = b&(1<<5) != 0
		avx512f = b&(1<<16) != 0
		avx512dq = b&(1<<17) != 0
		// Bits 26 (avx512pf), 27 (avx512er) and 28 (avx512cd) are
		// deliberately not inspected.
		avx512bw = b&(1<<30) != 0
		avx512vl = b&(1<<31) != 0
		sha = b&(1<<29) != 0
	}

	// Stop here if XSAVE is unsupported or not enabled.
	if !xsave || !osxsave {
		return
	}

	// XSAVE is supported and enabled from here on, so the state bits can be
	// read unconditionally.
	state, _ := xgetbv(0)
	sseState := state&(1<<1) != 0
	avxState := state&(1<<2) != 0
	opmaskState := state&(1<<5) != 0
	zmmHI256State := state&(1<<6) != 0
	hi16ZmmState := state&(1<<7) != 0

	// Very unlikely that OS would enable XSAVE and then disable SSE.
	if !sseState {
		sse = false
		sse2 = false
		sse3 = false
		ssse3 = false
		sse41 = false
		sse42 = false
	}

	if avxState {
		avx = cpuAVX
		avx2 = cpuAVX2
	}

	// AVX-512 needs all three state components plus the F, DQ, BW and VL
	// feature bits.
	if opmaskState && zmmHI256State && hi16ZmmState {
		avx512 = avx512f && avx512dq && avx512bw && avx512vl
	}
}

View file

@ -1,24 +0,0 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package sha256
// cpuid takes a single uint32 op and returns four uint32 values named after
// the eax, ebx, ecx and edx registers. It is declared without a Go body, so
// the implementation must be provided outside Go (e.g. in assembly).
func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// cpuidex is like cpuid but takes a second argument, op2. It is declared
// without a Go body.
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// xgetbv takes an index and returns two uint32 values named eax and edx.
// It is declared without a Go body.
func xgetbv(index uint32) (eax, edx uint32)
// haveArmSha reports whether ARM SHA instructions can be used. This
// implementation performs no detection and always reports false.
func haveArmSha() bool {
	var present bool // zero value: false
	return present
}

View file

@ -1,53 +0,0 @@
// The MIT License (MIT)
//
// Copyright (c) 2015 Klaus Post
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// +build 386,!gccgo
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
//
// Executes CPUID with AX = op and CX = 0, then stores AX, BX, CX and DX
// into the four uint32 results.
// Frame layout used here: op at 0(FP), results at 4, 8, 12 and 16(FP).
TEXT ·cpuid(SB), 7, $0
// Clear CX so it is zero when CPUID executes.
XORL CX, CX
MOVL op+0(FP), AX
CPUID
// Copy the four output registers into the return slots.
MOVL AX, eax+4(FP)
MOVL BX, ebx+8(FP)
MOVL CX, ecx+12(FP)
MOVL DX, edx+16(FP)
RET
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
//
// Executes CPUID with AX = op and CX = op2, then stores AX, BX, CX and DX
// into the four uint32 results.
// Frame layout used here: op at 0(FP), op2 at 4(FP), results at 8, 12, 16
// and 20(FP).
TEXT ·cpuidex(SB), 7, $0
MOVL op+0(FP), AX
MOVL op2+4(FP), CX
CPUID
// Copy the four output registers into the return slots.
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv(index uint32) (eax, edx uint32)
//
// Loads index into CX, executes XGETBV and returns the resulting AX and DX.
// Frame layout used here: index at 0(FP), results at 4 and 8(FP).
TEXT ·xgetbv(SB), 7, $0
MOVL index+0(FP), CX
// The instruction is emitted as raw opcode bytes rather than by mnemonic.
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
MOVL AX, eax+4(FP)
MOVL DX, edx+8(FP)
RET

View file

@ -1,24 +0,0 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package sha256
// cpuid takes a single uint32 op and returns four uint32 values named after
// the eax, ebx, ecx and edx registers. It is declared without a Go body, so
// the implementation must be provided outside Go (e.g. in assembly).
func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
// cpuidex is like cpuid but takes a second argument, op2. It is declared
// without a Go body.
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
// xgetbv takes an index and returns two uint32 values named eax and edx.
// It is declared without a Go body.
func xgetbv(index uint32) (eax, edx uint32)
// haveArmSha reports whether ARM SHA instructions can be used. This
// implementation performs no detection and always reports false.
func haveArmSha() bool {
	var present bool // zero value: false
	return present
}

View file

@ -1,53 +0,0 @@
// The MIT License (MIT)
//
// Copyright (c) 2015 Klaus Post
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// +build amd64,!gccgo
// func cpuid(op uint32) (eax, ebx, ecx, edx uint32)
//
// Executes CPUID with AX = op and CX = 0, then stores AX, BX, CX and DX
// into the four uint32 results.
// Frame layout used here: op at 0(FP), results at 8, 12, 16 and 20(FP).
TEXT ·cpuid(SB), 7, $0
// Clear CX so it is zero when CPUID executes.
XORQ CX, CX
MOVL op+0(FP), AX
CPUID
// Copy the four output registers into the return slots.
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32)
//
// Executes CPUID with AX = op and CX = op2, then stores AX, BX, CX and DX
// into the four uint32 results.
// Frame layout used here: op at 0(FP), op2 at 4(FP), results at 8, 12, 16
// and 20(FP).
TEXT ·cpuidex(SB), 7, $0
MOVL op+0(FP), AX
MOVL op2+4(FP), CX
CPUID
// Copy the four output registers into the return slots.
MOVL AX, eax+8(FP)
MOVL BX, ebx+12(FP)
MOVL CX, ecx+16(FP)
MOVL DX, edx+20(FP)
RET
// func xgetbv(index uint32) (eax, edx uint32)
//
// Loads index into CX, executes XGETBV and returns the resulting AX and DX.
// Frame layout used here: index at 0(FP), results at 8 and 12(FP).
TEXT ·xgetbv(SB), 7, $0
MOVL index+0(FP), CX
// The instruction is emitted as raw opcode bytes rather than by mnemonic.
BYTE $0x0f; BYTE $0x01; BYTE $0xd0 // XGETBV
MOVL AX, eax+8(FP)
MOVL DX, edx+12(FP)
RET

View file

@ -1,32 +0,0 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package sha256
// cpuid is a stub that ignores op and reports zero for every register.
func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
// cpuidex is a stub that ignores op and op2 and reports zero for every
// register.
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
// xgetbv is a stub that ignores index and reports zero for both halves.
func xgetbv(index uint32) (eax, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
// haveArmSha reports whether ARM SHA instructions can be used. This
// implementation performs no detection and always reports false.
func haveArmSha() bool {
	var present bool // zero value: false
	return present
}

View file

@ -1,49 +0,0 @@
// +build arm64,linux
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
package sha256
import (
"bytes"
"io/ioutil"
)
// cpuid is a stub that ignores op and reports zero for every register.
func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
// cpuidex is a stub that ignores op and op2 and reports zero for every
// register.
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
// xgetbv is a stub that ignores index and reports zero for both halves.
func xgetbv(index uint32) (eax, edx uint32) {
	// The named results are zero-initialised and never assigned.
	return
}
const (
	// procCPUInfo is the file consulted for CPU capabilities.
	procCPUInfo = "/proc/cpuinfo"

	// sha256Feature is the feature string searched for in procCPUInfo.
	sha256Feature = "sha2"
)

// haveArmSha reports whether the contents of procCPUInfo contain
// sha256Feature. It reports false when the file cannot be read.
func haveArmSha() bool {
	if contents, readErr := ioutil.ReadFile(procCPUInfo); readErr == nil {
		return bytes.Index(contents, []byte(sha256Feature)) >= 0
	}
	return false
}

View file

@ -1,4 +1,4 @@
// Minio Cloud Storage, (C) 2016 Minio, Inc.
// Minio Cloud Storage, (C) 2021 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
@ -13,22 +13,34 @@
// limitations under the License.
//
// +build !386,!amd64,!arm,!arm64 arm64,!linux
package sha256
func cpuid(op uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}
import (
"bytes"
"io/ioutil"
"runtime"
func cpuidex(op, op2 uint32) (eax, ebx, ecx, edx uint32) {
return 0, 0, 0, 0
}
"github.com/klauspost/cpuid/v2"
)
func xgetbv(index uint32) (eax, edx uint32) {
return 0, 0
}
func hasArmSha2() bool {
if cpuid.CPU.Has(cpuid.SHA2) {
return true
}
if runtime.GOARCH != "arm64" || runtime.GOOS != "linux" {
return false
}
// Fall back to hacky cpuinfo parsing...
const procCPUInfo = "/proc/cpuinfo"
// Feature to check for.
const sha256Feature = "sha2"
cpuInfo, err := ioutil.ReadFile(procCPUInfo)
if err != nil {
return false
}
return bytes.Contains(cpuInfo, []byte(sha256Feature))
func haveArmSha() bool {
return false
}

View file

@ -1,3 +1,5 @@
module github.com/minio/sha256-simd
go 1.12
go 1.13
require github.com/klauspost/cpuid/v2 v2.0.4

4
vendor/github.com/minio/sha256-simd/go.sum generated vendored Normal file
View file

@ -0,0 +1,4 @@
github.com/klauspost/cpuid/v2 v2.0.3 h1:DNljyrHyxlkk8139OXIAAauCwV8eQGDD6Z8YqnDXdZw=
github.com/klauspost/cpuid/v2 v2.0.3/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=
github.com/klauspost/cpuid/v2 v2.0.4 h1:g0I61F2K2DjRHz1cnxlkNSBIaePVoJIjjnHui8QHbiw=
github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg=

View file

@ -21,6 +21,8 @@ import (
"encoding/binary"
"hash"
"runtime"
"github.com/klauspost/cpuid/v2"
)
// Size - The size of a SHA256 checksum in bytes.
@ -67,10 +69,6 @@ type blockfuncType int
const (
blockfuncGeneric blockfuncType = iota
blockfuncAvx512 blockfuncType = iota
blockfuncAvx2 blockfuncType = iota
blockfuncAvx blockfuncType = iota
blockfuncSsse blockfuncType = iota
blockfuncSha blockfuncType = iota
blockfuncArm blockfuncType = iota
)
@ -78,26 +76,24 @@ const (
var blockfunc blockfuncType
func init() {
is386bit := runtime.GOARCH == "386"
isARM := runtime.GOARCH == "arm"
blockfunc = blockfuncGeneric
switch {
case is386bit || isARM:
blockfunc = blockfuncGeneric
case sha && ssse3 && sse41:
case hasSHAExtensions():
blockfunc = blockfuncSha
case avx2:
blockfunc = blockfuncAvx2
case avx:
blockfunc = blockfuncAvx
case ssse3:
blockfunc = blockfuncSsse
case armSha:
case hasArmSha2():
blockfunc = blockfuncArm
default:
blockfunc = blockfuncGeneric
}
}
var avx512 = cpuid.CPU.Supports(cpuid.AVX512F, cpuid.AVX512DQ, cpuid.AVX512BW, cpuid.AVX512VL)
// hasSHAExtensions reports whether the SHA-extension code path may be used:
// cpuid.CPU.Supports must confirm SHA, SSSE3 and SSE4, and the binary must
// have been built for amd64.
func hasSHAExtensions() bool {
	if !cpuid.CPU.Supports(cpuid.SHA, cpuid.SSSE3, cpuid.SSE4) {
		return false
	}
	return runtime.GOARCH == "amd64"
}
// New returns a new hash.Hash computing the SHA256 checksum.
func New() hash.Hash {
if blockfunc != blockfuncGeneric {
@ -278,12 +274,6 @@ func (d *digest) checkSum() (digest [Size]byte) {
func block(dig *digest, p []byte) {
if blockfunc == blockfuncSha {
blockShaGo(dig, p)
} else if blockfunc == blockfuncAvx2 {
blockAvx2Go(dig, p)
} else if blockfunc == blockfuncAvx {
blockAvxGo(dig, p)
} else if blockfunc == blockfuncSsse {
blockSsseGo(dig, p)
} else if blockfunc == blockfuncArm {
blockArmGo(dig, p)
} else if blockfunc == blockfuncGeneric {

View file

@ -1,22 +0,0 @@
//+build !noasm,!appengine
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package sha256
// blockAvx2 is declared without a Go body, so its implementation is supplied
// outside Go. It takes a []uint32 h and a []uint8 message.
// NOTE(review): presumably h is the SHA-256 state and message a whole number
// of 64-byte blocks - confirm against the assembly, which is not visible here.
//go:noescape
func blockAvx2(h []uint32, message []uint8)

File diff suppressed because it is too large Load diff

View file

@ -1,4 +1,4 @@
//+build !noasm,!appengine
//+build !noasm,!appengine,gc
/*
* Minio Cloud Storage, (C) 2017 Minio, Inc.

View file

@ -1,22 +0,0 @@
//+build !noasm,!appengine
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package sha256
// blockAvx is declared without a Go body; the matching assembly routine is
// `TEXT ·blockAvx`. That routine loads eight uint32 words from h as the
// working state a..h, reads message as its input and branches to done_hash
// when len(message) is zero.
//
// The reserved arguments are not inputs. The assembly uses their argument
// slots as scratch storage:
//   - reserved0/reserved1 (48..63(FP)): the 16-byte k+w transfer area,
//   - reserved2 (64(FP)): the address of the end of message,
//   - reserved3 (72(FP)): the saved message pointer.
//go:noescape
func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)

View file

@ -1,408 +0,0 @@
//+build !noasm,!appengine
// SHA256 implementation for AVX
//
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// This code is based on an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// together with the reference implementation from the following authors:
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
//
// For Golang it has been converted to Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
// equivalents
//
#include "textflag.h"
// ROTATE_XS rotates the contents of X4..X7 by one register:
// X4 <- X5, X5 <- X6, X6 <- X7, X7 <- old X4. X15 is used as the temporary
// and is overwritten.
#define ROTATE_XS \
MOVOU X4, X15 \
MOVOU X5, X4 \
MOVOU X6, X5 \
MOVOU X7, X6 \
MOVOU X15, X7
// FOUR_ROUNDS_AND_SCHED performs four SHA-256 rounds on the scalar working
// variables a..h and, interleaved with them, computes the next four message
// schedule words.
//
// Scalar side: R13, R14 and R15 are the temporaries y0, y1 and y2. Each round
// adds the precomputed k+w word read from _xfer+48/52/56/60(FP). The roles of
// a..h rotate by one position per round instead of moving data: round 1
// updates h and d, round 2 updates g and c, round 3 updates f and b, round 4
// updates e and a.
//
// The per-line comments write rotations as ">>": on 32-bit values
// `ROLL $18` is a rotate right by 14 (25-11), `ROLL $27` by 5, `ROLL $26` by
// 6, `ROLL $23` by 9, `ROLL $21` by 11 and `ROLL $30` by 2.
//
// Vector side: the AVX instructions are emitted as raw LONG/WORD/BYTE
// encodings; the intended instruction is given in the comment on each line.
// They read X4..X7 plus the shuffle masks in X10 and X12, use X0..X3, X8 and
// X11 as temporaries, and leave the four new schedule words in X4 before
// ROTATE_XS rotates X4..X7.
//
// compute s0 four at a time and s1 two at a time
// compute W[-16] + W[-7] 4 at a time
#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
MOVL e, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL a, R14 \ // y1 = a
LONG $0x0f41e3c4; WORD $0x04c6 \ // VPALIGNR XMM0,XMM7,XMM6,0x4 /* XTMP0 = W[-7] */
ROLL $23, R14 \ // y1 = a >> (22-13)
XORL e, R13 \ // y0 = e ^ (e >> (25-11))
MOVL f, R15 \ // y2 = f
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL a, R14 \ // y1 = a ^ (a >> (22-13)
XORL g, R15 \ // y2 = f^g
LONG $0xc4fef9c5 \ // VPADDD XMM0,XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6) )
ANDL e, R15 \ // y2 = (f^g)&e
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
\
\ // compute s0
\
LONG $0x0f51e3c4; WORD $0x04cc \ // VPALIGNR XMM1,XMM5,XMM4,0x4 /* XTMP1 = W[-15] */
XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
MOVL a, R13 \ // y0 = a
ADDL R15, h \ // h = h + S1 + CH + k + w
\ // ROTATE_ARGS
MOVL a, R15 \ // y2 = a
LONG $0xd172e9c5; BYTE $0x07 \ // VPSRLD XMM2,XMM1,0x7 /* */
ORL c, R13 \ // y0 = a|c
ADDL h, d \ // d = d + h + S1 + CH + k + w
ANDL c, R15 \ // y2 = a&c
LONG $0xf172e1c5; BYTE $0x19 \ // VPSLLD XMM3,XMM1,0x19 /* */
ANDL b, R13 \ // y0 = (a|c)&b
ADDL R14, h \ // h = h + S1 + CH + k + w + S0
LONG $0xdaebe1c5 \ // VPOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL d, R13 \ // y0 = e
MOVL h, R14 \ // y1 = a
ROLL $18, R13 \ // y0 = e >> (25-11)
XORL d, R13 \ // y0 = e ^ (e >> (25-11))
MOVL e, R15 \ // y2 = f
ROLL $23, R14 \ // y1 = a >> (22-13)
LONG $0xd172e9c5; BYTE $0x12 \ // VPSRLD XMM2,XMM1,0x12 /* */
XORL h, R14 \ // y1 = a ^ (a >> (22-13)
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL f, R15 \ // y2 = f^g
LONG $0xd172b9c5; BYTE $0x03 \ // VPSRLD XMM8,XMM1,0x3 /* XTMP4 = W[-15] >> 3 */
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL d, R15 \ // y2 = (f^g)&e
ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
LONG $0xf172f1c5; BYTE $0x0e \ // VPSLLD XMM1,XMM1,0xe /* */
XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
LONG $0xd9efe1c5 \ // VPXOR XMM3,XMM3,XMM1 /* */
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
LONG $0xdaefe1c5 \ // VPXOR XMM3,XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR */
MOVL h, R13 \ // y0 = a
ADDL R15, g \ // h = h + S1 + CH + k + w
MOVL h, R15 \ // y2 = a
LONG $0xef61c1c4; BYTE $0xc8 \ // VPXOR XMM1,XMM3,XMM8 /* XTMP1 = s0 */
ORL b, R13 \ // y0 = a|c
ADDL g, c \ // d = d + h + S1 + CH + k + w
ANDL b, R15 \ // y2 = a&c
\
\ // compute low s1
\
LONG $0xd770f9c5; BYTE $0xfa \ // VPSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
ANDL a, R13 \ // y0 = (a|c)&b
ADDL R14, g \ // h = h + S1 + CH + k + w + S0
LONG $0xc1fef9c5 \ // VPADDD XMM0,XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL c, R13 \ // y0 = e
MOVL g, R14 \ // y1 = a
ROLL $18, R13 \ // y0 = e >> (25-11)
XORL c, R13 \ // y0 = e ^ (e >> (25-11))
ROLL $23, R14 \ // y1 = a >> (22-13)
MOVL d, R15 \ // y2 = f
XORL g, R14 \ // y1 = a ^ (a >> (22-13)
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
LONG $0xd272b9c5; BYTE $0x0a \ // VPSRLD XMM8,XMM2,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
XORL e, R15 \ // y2 = f^g
LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL c, R15 \ // y2 = (f^g)&e
LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */
ADDL R13, R15 \ // y2 = S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
LONG $0xc2ef39c5 \ // VPXOR XMM8,XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
MOVL g, R13 \ // y0 = a
ADDL R15, f \ // h = h + S1 + CH + k + w
MOVL g, R15 \ // y2 = a
LONG $0x003942c4; BYTE $0xc2 \ // VPSHUFB XMM8,XMM8,XMM10 /* XTMP4 = s1 {00BA} */
ORL a, R13 \ // y0 = a|c
ADDL f, b \ // d = d + h + S1 + CH + k + w
ANDL a, R15 \ // y2 = a&c
LONG $0xfe79c1c4; BYTE $0xc0 \ // VPADDD XMM0,XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
ANDL h, R13 \ // y0 = (a|c)&b
ADDL R14, f \ // h = h + S1 + CH + k + w + S0
\
\ // compute high s1
\
LONG $0xd070f9c5; BYTE $0x50 \ // VPSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL b, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL f, R14 \ // y1 = a
ROLL $23, R14 \ // y1 = a >> (22-13)
XORL b, R13 \ // y0 = e ^ (e >> (25-11))
MOVL c, R15 \ // y2 = f
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
LONG $0xd272a1c5; BYTE $0x0a \ // VPSRLD XMM11,XMM2,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
XORL f, R14 \ // y1 = a ^ (a >> (22-13)
XORL d, R15 \ // y2 = f^g
LONG $0xd273e1c5; BYTE $0x13 \ // VPSRLQ XMM3,XMM2,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL b, R15 \ // y2 = (f^g)&e
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
LONG $0xd273e9c5; BYTE $0x11 \ // VPSRLQ XMM2,XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
LONG $0xd3efe9c5 \ // VPXOR XMM2,XMM2,XMM3 /* */
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
LONG $0xdaef21c5 \ // VPXOR XMM11,XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
MOVL f, R13 \ // y0 = a
ADDL R15, e \ // h = h + S1 + CH + k + w
MOVL f, R15 \ // y2 = a
LONG $0x002142c4; BYTE $0xdc \ // VPSHUFB XMM11,XMM11,XMM12 /* XTMP5 = s1 {DC00} */
ORL h, R13 \ // y0 = a|c
ADDL e, a \ // d = d + h + S1 + CH + k + w
ANDL h, R15 \ // y2 = a&c
LONG $0xe0fea1c5 \ // VPADDD XMM4,XMM11,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
ANDL g, R13 \ // y0 = (a|c)&b
ADDL R14, e \ // h = h + S1 + CH + k + w + S0
ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
ROTATE_XS
// DO_ROUND performs a single SHA-256 round on the working variables a..h
// without any message scheduling. R13, R14 and R15 are the temporaries y0,
// y1 and y2. The precomputed k+w word for the round is read from
// _xfer+offset(FP). On exit d and h have been updated; the other six
// arguments are only read.
//
// The per-line comments write rotations as ">>": on 32-bit values
// `ROLL $18` is a rotate right by 14 (25-11), `ROLL $27` by 5, `ROLL $26` by
// 6, `ROLL $23` by 9, `ROLL $21` by 11 and `ROLL $30` by 2.
#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
MOVL e, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL a, R14 \ // y1 = a
XORL e, R13 \ // y0 = e ^ (e >> (25-11))
ROLL $23, R14 \ // y1 = a >> (22-13)
MOVL f, R15 \ // y2 = f
XORL a, R14 \ // y1 = a ^ (a >> (22-13)
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL g, R15 \ // y2 = f^g
XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
ANDL e, R15 \ // y2 = (f^g)&e
XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) & (e>>11) ^ (e>>25)
XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
ADDL R13, R15 \ // y2 = S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
MOVL a, R13 \ // y0 = a
ADDL R15, h \ // h = h + S1 + CH + k + w
MOVL a, R15 \ // y2 = a
ORL c, R13 \ // y0 = a|c
ADDL h, d \ // d = d + h + S1 + CH + k + w
ANDL c, R15 \ // y2 = a&c
ANDL b, R13 \ // y0 = (a|c)&b
ADDL R14, h \ // h = h + S1 + CH + k + w + S0
ORL R15, R13 \ // y0 = MAJ = (a|c)&b)|(a&c)
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockAvx(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
//
// blockAvx folds every 64-byte block of message into the eight 32-bit state
// words stored in h, using the FOUR_ROUNDS_AND_SCHED and DO_ROUND macros.
// It returns immediately when message is empty. The block loop terminates only
// when the message pointer equals base+len exactly, so len(message) is expected
// to be a multiple of 64.
// The routine declares no local frame ($0) and uses the reserved argument slots
// as scratch space instead: the 16 bytes at 48(FP) (reserved0/reserved1) carry
// the k+w values that the round macros read through _xfer, reserved2 keeps the
// end-of-message pointer and reserved3 the pointer to the current block.
// The TEXT flag 7 is NOPROF|DUPOK|NOSPLIT in textflag.h.
// NOTE(review): BP is overwritten with the constants pointer and is not restored
// in this function — confirm that is acceptable for the targeted toolchain.
TEXT ·blockAvx(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h
MOVQ message_base+24(FP), R8 // &message
MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0
JEQ done_hash // nothing to hash for an empty message
ADDQ R8, R9 // R9 = &message + len = end of message
MOVQ R9, reserved2+64(FP) // store end of message
// Register definition
// a --> eax
// b --> ebx
// c --> ecx
// d --> r8d
// e --> edx
// f --> r9d
// g --> r10d
// h --> r11d
//
// y0 --> r13d
// y1 --> r14d
// y2 --> r15d
MOVL (0*4)(SI), AX // a = H0
MOVL (1*4)(SI), BX // b = H1
MOVL (2*4)(SI), CX // c = H2
MOVL (3*4)(SI), R8 // d = H3
MOVL (4*4)(SI), DX // e = H4
MOVL (5*4)(SI), R9 // f = H5
MOVL (6*4)(SI), R10 // g = H6
MOVL (7*4)(SI), R11 // h = H7
MOVOU bflipMask<>(SB), X13 // byte-swap mask used on the input dwords
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message_base+24(FP), SI // SI: &message
// loop0 runs once per 64-byte block; BP is reset to the start of the constants.
loop0:
LEAQ constants<>(SB), BP
// byte swap first 16 dwords
MOVOU 0*16(SI), X4
LONG $0x0059c2c4; BYTE $0xe5 // VPSHUFB XMM4, XMM4, XMM13
MOVOU 1*16(SI), X5
LONG $0x0051c2c4; BYTE $0xed // VPSHUFB XMM5, XMM5, XMM13
MOVOU 2*16(SI), X6
LONG $0x0049c2c4; BYTE $0xf5 // VPSHUFB XMM6, XMM6, XMM13
MOVOU 3*16(SI), X7
LONG $0x0041c2c4; BYTE $0xfd // VPSHUFB XMM7, XMM7, XMM13
MOVQ SI, reserved3+72(FP) // save the block pointer; SI is reused after the rounds
MOVD $0x3, DI // loop1 counter: 3 iterations of 16 rounds
// schedule 48 input dwords, by doing 3 rounds of 16 each
loop1:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, reserved0+48(FP) // publish k+w for the next four rounds
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x10 // VPADDD XMM9, XMM4, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
LONG $0x4dfe59c5; BYTE $0x20 // VPADDD XMM9, XMM4, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
LONG $0x4dfe59c5; BYTE $0x30 // VPADDD XMM9, XMM4, 48[RBP] /* Add 4th constant to message */
MOVOU X9, reserved0+48(FP)
ADDQ $64, BP // advance past the 16 constants just consumed
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
SUBQ $1, DI
JNE loop1
MOVD $0x2, DI // loop2 counter: 2 iterations of 8 rounds, no scheduling
loop2:
LONG $0x4dfe59c5; BYTE $0x00 // VPADDD XMM9, XMM4, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
LONG $0x4dfe51c5; BYTE $0x10 // VPADDD XMM9, XMM5, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, reserved0+48(FP)
ADDQ $32, BP // advance past the 8 constants just consumed
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56)
DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60)
MOVOU X6, X4 // the second pass consumes the dwords held in X6 and X7
MOVOU X7, X5
SUBQ $1, DI
JNE loop2
MOVQ h+0(FP), SI // SI: &h
ADDL (0*4)(SI), AX // H0 = a + H0
MOVL AX, (0*4)(SI)
ADDL (1*4)(SI), BX // H1 = b + H1
MOVL BX, (1*4)(SI)
ADDL (2*4)(SI), CX // H2 = c + H2
MOVL CX, (2*4)(SI)
ADDL (3*4)(SI), R8 // H3 = d + H3
MOVL R8, (3*4)(SI)
ADDL (4*4)(SI), DX // H4 = e + H4
MOVL DX, (4*4)(SI)
ADDL (5*4)(SI), R9 // H5 = f + H5
MOVL R9, (5*4)(SI)
ADDL (6*4)(SI), R10 // H6 = g + H6
MOVL R10, (6*4)(SI)
ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI)
MOVQ reserved3+72(FP), SI // restore the block pointer
ADDQ $64, SI // step to the next 64-byte block
CMPQ reserved2+64(FP), SI // reached the stored end of message?
JNE loop0
done_hash:
RET
// Constants table
// The 64 SHA-256 round constants K[0..63], packed as 32 little-endian quadwords
// holding two consecutive 32-bit constants each (low dword first, e.g. the first
// entry stores 0x428a2f98 followed by 0x71374491).
DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
// bflipMask: byte-shuffle control that reverses the byte order inside each
// 32-bit dword (3,2,1,0, 7,6,5,4, ...); loaded into X13 for the input byte swap.
DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
// shuf00BA: moves dwords 0 and 2 into the low half; the 0xFF control bytes
// clear the high half (xBxA -> 00BA).
DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
// shufDC00: moves dwords 0 and 2 into the high half; the 0xFF control bytes
// clear the low half (xDxC -> DC00).
DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
// Symbol sizes: 256 bytes of round constants (flag 8 is RODATA in textflag.h)
// and three 16-byte read-only, pointer-free masks.
GLOBL constants<>(SB), 8, $256
GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16

View file

@ -1,4 +1,4 @@
//+build !noasm,!appengine
//+build !noasm,!appengine,gc
package sha256

View file

@ -1,22 +0,0 @@
//+build !noasm,!appengine
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package sha256
// blockSsse is declared without a Go body; the implementation is the assembly
// routine ·blockSsse. h carries the eight 32-bit state words that are updated
// in place and message the input, consumed in 64-byte blocks. The four reserved
// arguments are not inputs: the assembly overwrites their stack slots as
// scratch space, so callers may pass zeros.
//go:noescape
func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)

View file

@ -1,429 +0,0 @@
//+build !noasm,!appengine
// SHA256 implementation for SSSE3
//
// Minio Cloud Storage, (C) 2016 Minio, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
//
// This code is based on an Intel White-Paper:
// "Fast SHA-256 Implementations on Intel Architecture Processors"
//
// together with the reference implementation from the following authors:
// James Guilford <james.guilford@intel.com>
// Kirk Yap <kirk.s.yap@intel.com>
// Tim Chen <tim.c.chen@linux.intel.com>
//
// For Golang it has been converted to Plan 9 assembly with the help of
// github.com/minio/asm2plan9s to assemble Intel instructions to their Plan9
// equivalents
//
#include "textflag.h"
// ROTATE_XS rotates the four message-schedule vectors by one register:
// X4 <- X5, X5 <- X6, X6 <- X7 and X7 <- the previous X4, with X15 as the
// temporary that carries the old X4.
#define ROTATE_XS \
MOVOU X4, X15 \
MOVOU X5, X4 \
MOVOU X6, X5 \
MOVOU X7, X6 \
MOVOU X15, X7
// FOUR_ROUNDS_AND_SCHED performs four SHA-256 rounds on the working variables
// a..h, interleaved with the SIMD computation of the next four message-schedule
// dwords.
// compute s0 four at a time and s1 two at a time
// compute W[-16] + W[-7] 4 at a time
//
// The k+w values of the four rounds are read from _xfer+48 .. _xfer+60(FP).
// X4..X7 hold the schedule dwords on entry; the four new dwords are produced in
// X4 and the trailing ROTATE_XS then rotates X4..X7 by one register.
// Scratch: R13/R14/R15 (y0/y1/y2) and X0-X3, X8, X11, X15. X10 and X12 must hold
// the shuf00BA and shufDC00 masks used by the PSHUFB instructions.
// In the comments ">>" on the scalar side stands for a 32-bit rotate right
// (ROLL $n equals a rotate right by 32-n). After every "ROTATE_ARGS" marker the
// comments keep the logical names a..h of the new round while the instructions
// use the macro arguments shifted by one position.
#define FOUR_ROUNDS_AND_SCHED(a, b, c, d, e, f, g, h) \
MOVL e, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL a, R14 \ // y1 = a
MOVOU X7, X0 \
LONG $0x0f3a0f66; WORD $0x04c6 \ // PALIGNR XMM0,XMM6,0x4 /* XTMP0 = W[-7] */
ROLL $23, R14 \ // y1 = a >> (22-13)
XORL e, R13 \ // y0 = e ^ (e >> (25-11))
MOVL f, R15 \ // y2 = f
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL a, R14 \ // y1 = a ^ (a >> (22-13))
XORL g, R15 \ // y2 = f^g
LONG $0xc4fe0f66 \ // PADDD XMM0,XMM4 /* XTMP0 = W[-7] + W[-16] */
XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL e, R15 \ // y2 = (f^g)&e
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
\
\ // compute s0
\
MOVOU X5, X1 \
LONG $0x0f3a0f66; WORD $0x04cc \ // PALIGNR XMM1,XMM4,0x4 /* XTMP1 = W[-15] */
XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+48(FP), R15 \ // y2 = k + w + S1 + CH
MOVL a, R13 \ // y0 = a
ADDL R15, h \ // h = h + S1 + CH + k + w
\ // ROTATE_ARGS
MOVL a, R15 \ // y2 = a
MOVOU X1, X2 \
LONG $0xd2720f66; BYTE $0x07 \ // PSRLD XMM2,0x7 /* XMM2 = W[-15] >> 7 */
ORL c, R13 \ // y0 = a|c
ADDL h, d \ // d = d + h + S1 + CH + k + w
ANDL c, R15 \ // y2 = a&c
MOVOU X1, X3 \
LONG $0xf3720f66; BYTE $0x19 \ // PSLLD XMM3,0x19 /* XMM3 = W[-15] << (32-7) */
ANDL b, R13 \ // y0 = (a|c)&b
ADDL R14, h \ // h = h + S1 + CH + k + w + S0
LONG $0xdaeb0f66 \ // POR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 */
ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
ADDL R13, h \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL d, R13 \ // y0 = e
MOVL h, R14 \ // y1 = a
ROLL $18, R13 \ // y0 = e >> (25-11)
XORL d, R13 \ // y0 = e ^ (e >> (25-11))
MOVL e, R15 \ // y2 = f
ROLL $23, R14 \ // y1 = a >> (22-13)
MOVOU X1, X2 \
LONG $0xd2720f66; BYTE $0x12 \ // PSRLD XMM2,0x12 /* XMM2 = W[-15] >> 18 */
XORL h, R14 \ // y1 = a ^ (a >> (22-13))
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL f, R15 \ // y2 = f^g
MOVOU X1, X8 \
LONG $0x720f4166; WORD $0x03d0 \ // PSRLD XMM8,0x3 /* XTMP4 = W[-15] >> 3 */
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
XORL d, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL d, R15 \ // y2 = (f^g)&e
ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
LONG $0xf1720f66; BYTE $0x0e \ // PSLLD XMM1,0xe /* XMM1 = W[-15] << (32-18) */
XORL h, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
XORL f, R15 \ // y2 = CH = ((f^g)&e)^g
LONG $0xd9ef0f66 \ // PXOR XMM3,XMM1 /* XMM3 = W[-15] MY_ROR 7 ^ W[-15] << (32-18) */
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+52(FP), R15 \ // y2 = k + w + S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
LONG $0xdaef0f66 \ // PXOR XMM3,XMM2 /* XTMP1 = W[-15] MY_ROR 7 ^ W[-15] MY_ROR 18 */
MOVL h, R13 \ // y0 = a
ADDL R15, g \ // h = h + S1 + CH + k + w
MOVL h, R15 \ // y2 = a
MOVOU X3, X1 \
LONG $0xef0f4166; BYTE $0xc8 \ // PXOR XMM1,XMM8 /* XTMP1 = s0 */
ORL b, R13 \ // y0 = a|c
ADDL g, c \ // d = d + h + S1 + CH + k + w
ANDL b, R15 \ // y2 = a&c
\
\ // compute low s1
\
LONG $0xd7700f66; BYTE $0xfa \ // PSHUFD XMM2,XMM7,0xfa /* XTMP2 = W[-2] {BBAA} */
ANDL a, R13 \ // y0 = (a|c)&b
ADDL R14, g \ // h = h + S1 + CH + k + w + S0
LONG $0xc1fe0f66 \ // PADDD XMM0,XMM1 /* XTMP0 = W[-16] + W[-7] + s0 */
ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
ADDL R13, g \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL c, R13 \ // y0 = e
MOVL g, R14 \ // y1 = a
ROLL $18, R13 \ // y0 = e >> (25-11)
XORL c, R13 \ // y0 = e ^ (e >> (25-11))
ROLL $23, R14 \ // y1 = a >> (22-13)
MOVL d, R15 \ // y2 = f
XORL g, R14 \ // y1 = a ^ (a >> (22-13))
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
MOVOU X2, X8 \
LONG $0x720f4166; WORD $0x0ad0 \ // PSRLD XMM8,0xa /* XTMP4 = W[-2] >> 10 {BBAA} */
XORL e, R15 \ // y2 = f^g
MOVOU X2, X3 \
LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xBxA} */
XORL c, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL c, R15 \ // y2 = (f^g)&e
LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xBxA} */
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
XORL g, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
XORL e, R15 \ // y2 = CH = ((f^g)&e)^g
ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* XMM2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xBxA} */
ADDL R13, R15 \ // y2 = S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL _xfer+56(FP), R15 \ // y2 = k + w + S1 + CH
LONG $0xef0f4466; BYTE $0xc2 \ // PXOR XMM8,XMM2 /* XTMP4 = s1 {xBxA} */
MOVL g, R13 \ // y0 = a
ADDL R15, f \ // h = h + S1 + CH + k + w
MOVL g, R15 \ // y2 = a
LONG $0x380f4566; WORD $0xc200 \ // PSHUFB XMM8,XMM10 /* XTMP4 = s1 {00BA} */
ORL a, R13 \ // y0 = a|c
ADDL f, b \ // d = d + h + S1 + CH + k + w
ANDL a, R15 \ // y2 = a&c
LONG $0xfe0f4166; BYTE $0xc0 \ // PADDD XMM0,XMM8 /* XTMP0 = {..., ..., W[1], W[0]} */
ANDL h, R13 \ // y0 = (a|c)&b
ADDL R14, f \ // h = h + S1 + CH + k + w + S0
\
\ // compute high s1
\
LONG $0xd0700f66; BYTE $0x50 \ // PSHUFD XMM2,XMM0,0x50 /* XTMP2 = W[-2] {DDCC} */
ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
ADDL R13, f \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
MOVL b, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL f, R14 \ // y1 = a
ROLL $23, R14 \ // y1 = a >> (22-13)
XORL b, R13 \ // y0 = e ^ (e >> (25-11))
MOVL c, R15 \ // y2 = f
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
MOVOU X2, X11 \
LONG $0x720f4166; WORD $0x0ad3 \ // PSRLD XMM11,0xa /* XTMP5 = W[-2] >> 10 {DDCC} */
XORL f, R14 \ // y1 = a ^ (a >> (22-13))
XORL d, R15 \ // y2 = f^g
MOVOU X2, X3 \
LONG $0xd3730f66; BYTE $0x13 \ // PSRLQ XMM3,0x13 /* XTMP3 = W[-2] MY_ROR 19 {xDxC} */
XORL b, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ANDL b, R15 \ // y2 = (f^g)&e
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
LONG $0xd2730f66; BYTE $0x11 \ // PSRLQ XMM2,0x11 /* XTMP2 = W[-2] MY_ROR 17 {xDxC} */
XORL f, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
XORL d, R15 \ // y2 = CH = ((f^g)&e)^g
LONG $0xd3ef0f66 \ // PXOR XMM2,XMM3 /* XMM2 = W[-2] MY_ROR 17 ^ W[-2] MY_ROR 19 {xDxC} */
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL R13, R15 \ // y2 = S1 + CH
ADDL _xfer+60(FP), R15 \ // y2 = k + w + S1 + CH
LONG $0xef0f4466; BYTE $0xda \ // PXOR XMM11,XMM2 /* XTMP5 = s1 {xDxC} */
MOVL f, R13 \ // y0 = a
ADDL R15, e \ // h = h + S1 + CH + k + w
MOVL f, R15 \ // y2 = a
LONG $0x380f4566; WORD $0xdc00 \ // PSHUFB XMM11,XMM12 /* XTMP5 = s1 {DC00} */
ORL h, R13 \ // y0 = a|c
ADDL e, a \ // d = d + h + S1 + CH + k + w
ANDL h, R15 \ // y2 = a&c
MOVOU X11, X4 \
LONG $0xe0fe0f66 \ // PADDD XMM4,XMM0 /* X0 = {W[3], W[2], W[1], W[0]} */
ANDL g, R13 \ // y0 = (a|c)&b
ADDL R14, e \ // h = h + S1 + CH + k + w + S0
ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
ADDL R13, e \ // h = h + S1 + CH + k + w + S0 + MAJ
\ // ROTATE_ARGS
ROTATE_XS
// DO_ROUND performs a single SHA-256 round without any message scheduling.
// a..h name the registers holding the eight working variables; offset is the
// displacement from FP of the precomputed k+w dword consumed by this round.
// R13, R14 and R15 are clobbered as the scratch values y0, y1 and y2.
// In the comments ">>" stands for a 32-bit rotate right: every ROLL $n rotates
// left by n, which is the same as rotating right by 32-n.
// The round result is accumulated into h and d, so consecutive invocations pass
// the registers rotated by one position instead of moving values around.
#define DO_ROUND(a, b, c, d, e, f, g, h, offset) \
MOVL e, R13 \ // y0 = e
ROLL $18, R13 \ // y0 = e >> (25-11)
MOVL a, R14 \ // y1 = a
XORL e, R13 \ // y0 = e ^ (e >> (25-11))
ROLL $23, R14 \ // y1 = a >> (22-13)
MOVL f, R15 \ // y2 = f
XORL a, R14 \ // y1 = a ^ (a >> (22-13))
ROLL $27, R13 \ // y0 = (e >> (11-6)) ^ (e >> (25-6))
XORL g, R15 \ // y2 = f^g
XORL e, R13 \ // y0 = e ^ (e >> (11-6)) ^ (e >> (25-6))
ROLL $21, R14 \ // y1 = (a >> (13-2)) ^ (a >> (22-2))
ANDL e, R15 \ // y2 = (f^g)&e
XORL a, R14 \ // y1 = a ^ (a >> (13-2)) ^ (a >> (22-2))
ROLL $26, R13 \ // y0 = S1 = (e>>6) ^ (e>>11) ^ (e>>25)
XORL g, R15 \ // y2 = CH = ((f^g)&e)^g
ADDL R13, R15 \ // y2 = S1 + CH
ROLL $30, R14 \ // y1 = S0 = (a>>2) ^ (a>>13) ^ (a>>22)
ADDL _xfer+offset(FP), R15 \ // y2 = k + w + S1 + CH
MOVL a, R13 \ // y0 = a
ADDL R15, h \ // h = h + S1 + CH + k + w
MOVL a, R15 \ // y2 = a
ORL c, R13 \ // y0 = a|c
ADDL h, d \ // d = d + h + S1 + CH + k + w
ANDL c, R15 \ // y2 = a&c
ANDL b, R13 \ // y0 = (a|c)&b
ADDL R14, h \ // h = h + S1 + CH + k + w + S0
ORL R15, R13 \ // y0 = MAJ = ((a|c)&b)|(a&c)
ADDL R13, h // h = h + S1 + CH + k + w + S0 + MAJ
// func blockSsse(h []uint32, message []uint8, reserved0, reserved1, reserved2, reserved3 uint64)
//
// blockSsse folds every 64-byte block of message into the eight 32-bit state
// words stored in h, using the FOUR_ROUNDS_AND_SCHED and DO_ROUND macros.
// It returns immediately when message is empty. The block loop terminates only
// when the message pointer equals base+len exactly, so len(message) is expected
// to be a multiple of 64.
// The routine declares no local frame ($0) and uses the reserved argument slots
// as scratch space instead: the 16 bytes at 48(FP) (reserved0/reserved1) carry
// the k+w values that the round macros read through _xfer, reserved2 keeps the
// end-of-message pointer and reserved3 the pointer to the current block.
// The TEXT flag 7 is NOPROF|DUPOK|NOSPLIT in textflag.h.
// NOTE(review): BP is overwritten with the constants pointer and is not restored
// in this function — confirm that is acceptable for the targeted toolchain.
TEXT ·blockSsse(SB), 7, $0-80
MOVQ h+0(FP), SI // SI: &h
MOVQ message_base+24(FP), R8 // &message
MOVQ message_len+32(FP), R9 // length of message
CMPQ R9, $0
JEQ done_hash // nothing to hash for an empty message
ADDQ R8, R9 // R9 = &message + len = end of message
MOVQ R9, reserved2+64(FP) // store end of message
// Register definition
// a --> eax
// b --> ebx
// c --> ecx
// d --> r8d
// e --> edx
// f --> r9d
// g --> r10d
// h --> r11d
//
// y0 --> r13d
// y1 --> r14d
// y2 --> r15d
MOVL (0*4)(SI), AX // a = H0
MOVL (1*4)(SI), BX // b = H1
MOVL (2*4)(SI), CX // c = H2
MOVL (3*4)(SI), R8 // d = H3
MOVL (4*4)(SI), DX // e = H4
MOVL (5*4)(SI), R9 // f = H5
MOVL (6*4)(SI), R10 // g = H6
MOVL (7*4)(SI), R11 // h = H7
MOVOU bflipMask<>(SB), X13 // byte-swap mask used on the input dwords
MOVOU shuf00BA<>(SB), X10 // shuffle xBxA -> 00BA
MOVOU shufDC00<>(SB), X12 // shuffle xDxC -> DC00
MOVQ message_base+24(FP), SI // SI: &message
// loop0 runs once per 64-byte block; BP is reset to the start of the constants.
loop0:
LEAQ constants<>(SB), BP
// byte swap first 16 dwords
MOVOU 0*16(SI), X4
LONG $0x380f4166; WORD $0xe500 // PSHUFB XMM4, XMM13
MOVOU 1*16(SI), X5
LONG $0x380f4166; WORD $0xed00 // PSHUFB XMM5, XMM13
MOVOU 2*16(SI), X6
LONG $0x380f4166; WORD $0xf500 // PSHUFB XMM6, XMM13
MOVOU 3*16(SI), X7
LONG $0x380f4166; WORD $0xfd00 // PSHUFB XMM7, XMM13
MOVQ SI, reserved3+72(FP) // save the block pointer; SI is reused after the rounds
MOVD $0x3, DI // loop1 counter: 3 iterations of 16 rounds
// Align
// nop WORD PTR [rax+rax*1+0x0]
// schedule 48 input dwords, by doing 3 rounds of 16 each
loop1:
MOVOU X4, X9 // PADDD is destructive, so add the constants into a copy
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, reserved0+48(FP) // publish k+w for the next four rounds
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x204d // PADDD XMM9, 32[RBP] /* Add 3rd constant to message */
MOVOU X9, reserved0+48(FP)
FOUR_ROUNDS_AND_SCHED(AX, BX, CX, R8, DX, R9, R10, R11)
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x304d // PADDD XMM9, 48[RBP] /* Add 4th constant to message */
MOVOU X9, reserved0+48(FP)
ADDQ $64, BP // advance past the 16 constants just consumed
FOUR_ROUNDS_AND_SCHED(DX, R9, R10, R11, AX, BX, CX, R8)
SUBQ $1, DI
JNE loop1
MOVD $0x2, DI // loop2 counter: 2 iterations of 8 rounds, no scheduling
loop2:
MOVOU X4, X9
LONG $0xfe0f4466; WORD $0x004d // PADDD XMM9, 0[RBP] /* Add 1st constant to first part of message */
MOVOU X9, reserved0+48(FP)
DO_ROUND( AX, BX, CX, R8, DX, R9, R10, R11, 48)
DO_ROUND(R11, AX, BX, CX, R8, DX, R9, R10, 52)
DO_ROUND(R10, R11, AX, BX, CX, R8, DX, R9, 56)
DO_ROUND( R9, R10, R11, AX, BX, CX, R8, DX, 60)
MOVOU X5, X9
LONG $0xfe0f4466; WORD $0x104d // PADDD XMM9, 16[RBP] /* Add 2nd constant to message */
MOVOU X9, reserved0+48(FP)
ADDQ $32, BP // advance past the 8 constants just consumed
DO_ROUND( DX, R9, R10, R11, AX, BX, CX, R8, 48)
DO_ROUND( R8, DX, R9, R10, R11, AX, BX, CX, 52)
DO_ROUND( CX, R8, DX, R9, R10, R11, AX, BX, 56)
DO_ROUND( BX, CX, R8, DX, R9, R10, R11, AX, 60)
MOVOU X6, X4 // the second pass consumes the dwords held in X6 and X7
MOVOU X7, X5
SUBQ $1, DI
JNE loop2
MOVQ h+0(FP), SI // SI: &h
ADDL (0*4)(SI), AX // H0 = a + H0
MOVL AX, (0*4)(SI)
ADDL (1*4)(SI), BX // H1 = b + H1
MOVL BX, (1*4)(SI)
ADDL (2*4)(SI), CX // H2 = c + H2
MOVL CX, (2*4)(SI)
ADDL (3*4)(SI), R8 // H3 = d + H3
MOVL R8, (3*4)(SI)
ADDL (4*4)(SI), DX // H4 = e + H4
MOVL DX, (4*4)(SI)
ADDL (5*4)(SI), R9 // H5 = f + H5
MOVL R9, (5*4)(SI)
ADDL (6*4)(SI), R10 // H6 = g + H6
MOVL R10, (6*4)(SI)
ADDL (7*4)(SI), R11 // H7 = h + H7
MOVL R11, (7*4)(SI)
MOVQ reserved3+72(FP), SI // restore the block pointer
ADDQ $64, SI // step to the next 64-byte block
CMPQ reserved2+64(FP), SI // reached the stored end of message?
JNE loop0
done_hash:
RET
// Constants table
// The 64 SHA-256 round constants K[0..63], packed as 32 little-endian quadwords
// holding two consecutive 32-bit constants each (low dword first, e.g. the first
// entry stores 0x428a2f98 followed by 0x71374491).
DATA constants<>+0x0(SB)/8, $0x71374491428a2f98
DATA constants<>+0x8(SB)/8, $0xe9b5dba5b5c0fbcf
DATA constants<>+0x10(SB)/8, $0x59f111f13956c25b
DATA constants<>+0x18(SB)/8, $0xab1c5ed5923f82a4
DATA constants<>+0x20(SB)/8, $0x12835b01d807aa98
DATA constants<>+0x28(SB)/8, $0x550c7dc3243185be
DATA constants<>+0x30(SB)/8, $0x80deb1fe72be5d74
DATA constants<>+0x38(SB)/8, $0xc19bf1749bdc06a7
DATA constants<>+0x40(SB)/8, $0xefbe4786e49b69c1
DATA constants<>+0x48(SB)/8, $0x240ca1cc0fc19dc6
DATA constants<>+0x50(SB)/8, $0x4a7484aa2de92c6f
DATA constants<>+0x58(SB)/8, $0x76f988da5cb0a9dc
DATA constants<>+0x60(SB)/8, $0xa831c66d983e5152
DATA constants<>+0x68(SB)/8, $0xbf597fc7b00327c8
DATA constants<>+0x70(SB)/8, $0xd5a79147c6e00bf3
DATA constants<>+0x78(SB)/8, $0x1429296706ca6351
DATA constants<>+0x80(SB)/8, $0x2e1b213827b70a85
DATA constants<>+0x88(SB)/8, $0x53380d134d2c6dfc
DATA constants<>+0x90(SB)/8, $0x766a0abb650a7354
DATA constants<>+0x98(SB)/8, $0x92722c8581c2c92e
DATA constants<>+0xa0(SB)/8, $0xa81a664ba2bfe8a1
DATA constants<>+0xa8(SB)/8, $0xc76c51a3c24b8b70
DATA constants<>+0xb0(SB)/8, $0xd6990624d192e819
DATA constants<>+0xb8(SB)/8, $0x106aa070f40e3585
DATA constants<>+0xc0(SB)/8, $0x1e376c0819a4c116
DATA constants<>+0xc8(SB)/8, $0x34b0bcb52748774c
DATA constants<>+0xd0(SB)/8, $0x4ed8aa4a391c0cb3
DATA constants<>+0xd8(SB)/8, $0x682e6ff35b9cca4f
DATA constants<>+0xe0(SB)/8, $0x78a5636f748f82ee
DATA constants<>+0xe8(SB)/8, $0x8cc7020884c87814
DATA constants<>+0xf0(SB)/8, $0xa4506ceb90befffa
DATA constants<>+0xf8(SB)/8, $0xc67178f2bef9a3f7
// bflipMask: byte-shuffle control that reverses the byte order inside each
// 32-bit dword (3,2,1,0, 7,6,5,4, ...); loaded into X13 for the input byte swap.
DATA bflipMask<>+0x00(SB)/8, $0x0405060700010203
DATA bflipMask<>+0x08(SB)/8, $0x0c0d0e0f08090a0b
// shuf00BA: moves dwords 0 and 2 into the low half; the 0xFF control bytes
// clear the high half (xBxA -> 00BA).
DATA shuf00BA<>+0x00(SB)/8, $0x0b0a090803020100
DATA shuf00BA<>+0x08(SB)/8, $0xFFFFFFFFFFFFFFFF
// shufDC00: moves dwords 0 and 2 into the high half; the 0xFF control bytes
// clear the low half (xDxC -> DC00).
DATA shufDC00<>+0x00(SB)/8, $0xFFFFFFFFFFFFFFFF
DATA shufDC00<>+0x08(SB)/8, $0x0b0a090803020100
// Symbol sizes: 256 bytes of round constants (flag 8 is RODATA in textflag.h)
// and three 16-byte read-only, pointer-free masks.
GLOBL constants<>(SB), 8, $256
GLOBL bflipMask<>(SB), (NOPTR+RODATA), $16
GLOBL shuf00BA<>(SB), (NOPTR+RODATA), $16
GLOBL shufDC00<>(SB), (NOPTR+RODATA), $16

View file

@ -1,4 +1,4 @@
//+build !noasm,!appengine
//+build !noasm,!appengine,gc
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
@ -18,36 +18,10 @@
package sha256
// blockArmGo is an empty stub here: it ignores dig and p and returns at once.
func blockArmGo(dig *digest, p []byte) {}
// blockAvxGo runs blockAvx over p on a temporary copy of the eight state words
// of dig and stores the updated words back into dig.h afterwards. The four
// trailing arguments of blockAvx are passed as zero.
func blockAvxGo(dig *digest, p []byte) {
	state := make([]uint32, 8)
	for i := range state {
		state[i] = dig.h[i]
	}

	blockAvx(state, p, 0, 0, 0, 0)

	for i, word := range state {
		dig.h[i] = word
	}
}
// blockAvx2Go runs blockAvx2 over p on a temporary copy of the eight state
// words of dig and stores the updated words back into dig.h afterwards.
func blockAvx2Go(dig *digest, p []byte) {
	state := make([]uint32, 8)
	for i := range state {
		state[i] = dig.h[i]
	}

	blockAvx2(state, p)

	for i, word := range state {
		dig.h[i] = word
	}
}
// blockSsseGo copies the eight state words of dig into a temporary slice, runs
// blockSsse over p with the four reserved arguments set to zero, and writes the
// updated words back into dig.h.
func blockSsseGo(dig *digest, p []byte) {
h := []uint32{dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7]}
blockSsse(h[:], p[:], 0, 0, 0, 0)
dig.h[0], dig.h[1], dig.h[2], dig.h[3], dig.h[4], dig.h[5], dig.h[6], dig.h[7] = h[0], h[1], h[2], h[3], h[4], h[5], h[6], h[7]
// blockArmGo must never be reached: it ignores dig and p and panics
// unconditionally with a fixed message.
func blockArmGo(dig *digest, p []byte) {
	const reason = "blockArmGo called unexpectedly"
	panic(reason)
}
// blockShaGo hands the state of dig and the input p straight to blockSha, which
// receives a pointer to dig.h and can therefore update the state in place.
func blockShaGo(dig *digest, p []byte) {
	state := &dig.h
	blockSha(state, p)
}

View file

@ -1,4 +1,4 @@
//+build !noasm,!appengine
//+build !noasm,!appengine,gc
/*
* Minio Cloud Storage, (C) 2016 Minio, Inc.
@ -18,10 +18,9 @@
package sha256
// blockAvx2Go is an empty stub here: it ignores dig and p and returns at once.
func blockAvx2Go(dig *digest, p []byte) {}
// blockAvxGo is an empty stub here: it ignores dig and p and returns at once.
func blockAvxGo(dig *digest, p []byte) {}
// blockSsseGo is an empty stub here: it ignores dig and p and returns at once.
func blockSsseGo(dig *digest, p []byte) {}
// blockShaGo is an empty stub here: it ignores dig and p and returns at once.
func blockShaGo(dig *digest, p []byte) {}
// blockShaGo must never be reached: it ignores dig and p and panics
// unconditionally. The panic message names this function so that an unexpected
// call can be traced back to it.
func blockShaGo(dig *digest, p []byte) {
	panic("blockShaGo called unexpectedly")
}
// blockArm is declared without a Go body, so its implementation has to be
// supplied outside this file (presumably the arm64 assembly — confirm).
// NOTE(review): h looks like the hash state and message the input blocks,
// mirroring the amd64 block functions — verify against the implementation.
//go:noescape
func blockArm(h []uint32, message []uint8)

View file

@ -1,4 +1,4 @@
//+build appengine noasm !amd64,!arm64
//+build appengine noasm !amd64,!arm64 !gc
/*
* Minio Cloud Storage, (C) 2019 Minio, Inc.
@ -18,8 +18,11 @@
package sha256
// blockAvx2Go is an empty stub here: it ignores dig and p and returns at once.
func blockAvx2Go(dig *digest, p []byte) {}
// blockAvxGo is an empty stub here: it ignores dig and p and returns at once.
func blockAvxGo(dig *digest, p []byte) {}
// blockSsseGo is an empty stub here: it ignores dig and p and returns at once.
func blockSsseGo(dig *digest, p []byte) {}
// blockShaGo is an empty stub here: it ignores dig and p and returns at once.
func blockShaGo(dig *digest, p []byte) {}
// blockArmGo is an empty stub here: it ignores dig and p and returns at once.
func blockArmGo(dig *digest, p []byte) {}
// blockShaGo must never be reached: it ignores dig and p and panics
// unconditionally with a fixed message.
func blockShaGo(dig *digest, p []byte) {
	const reason = "blockShaGo called unexpectedly"
	panic(reason)
}
// blockArmGo must never be reached: it ignores dig and p and panics
// unconditionally with a fixed message.
func blockArmGo(dig *digest, p []byte) {
	const reason = "blockArmGo called unexpectedly"
	panic(reason)
}