update: macaron cores,gzip,session (#10522)

Co-authored-by: zeripath <art27@cantab.net>
2020-02-28 10:51:18 +01:00 · 2020-02-28 10:51:18 +01:00 · 8d2059a201
commit 8d2059a201
parent 694f44660f
47 changed files with 2154 additions and 349 deletions
--- a/vendor/github.com/klauspost/compress/flate/deflate.go
+++ b/vendor/github.com/klauspost/compress/flate/deflate.go
@ -48,6 +48,8 @@ const (
 	maxHashOffset       = 1 << 24

 	skipNever = math.MaxInt32
+
+	debugDeflate = false
 )

 type compressionLevel struct {
@ -59,15 +61,13 @@ type compressionLevel struct {
 // See https://blog.klauspost.com/rebalancing-deflate-compression-levels/
 var levels = []compressionLevel{
 	{}, // 0
-	// Level 1-4 uses specialized algorithm - values not used
+	// Level 1-6 uses specialized algorithm - values not used
 	{0, 0, 0, 0, 0, 1},
 	{0, 0, 0, 0, 0, 2},
 	{0, 0, 0, 0, 0, 3},
 	{0, 0, 0, 0, 0, 4},
-	// For levels 5-6 we don't bother trying with lazy matches.
-	// Lazy matching is at least 30% slower, with 1.5% increase.
-	{6, 0, 12, 8, 12, 5},
-	{8, 0, 24, 16, 16, 6},
+	{0, 0, 0, 0, 0, 5},
+	{0, 0, 0, 0, 0, 6},
 	// Levels 7-9 use increasingly more lazy matching
 	// and increasingly stringent conditions for "good enough".
 	{8, 8, 24, 16, skipNever, 7},
@ -203,9 +203,8 @@ func (d *compressor) writeBlockSkip(tok *tokens, index int, eof bool) error {
 // This is much faster than doing a full encode.
 // Should only be used after a start/reset.
 func (d *compressor) fillWindow(b []byte) {
-	// Do not fill window if we are in store-only mode,
-	// use constant or Snappy compression.
-	if d.level == 0 {
+	// Do not fill window if we are in store-only or huffman mode.
+	if d.level <= 0 {
 		return
 	}
 	if d.fast != nil {
@ -368,7 +367,7 @@ func (d *compressor) deflateLazy() {
 	// Sanity enables additional runtime tests.
 	// It's intended to be used during development
 	// to supplement the currently ad-hoc unit tests.
-	const sanity = false
+	const sanity = debugDeflate

 	if d.windowEnd-s.index < minMatchLength+maxMatchLength && !d.sync {
 		return
@ -644,7 +643,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).store
 	case level == ConstantCompression:
-		d.w.logReusePenalty = uint(4)
+		d.w.logNewTablePenalty = 4
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeHuff
@ -652,13 +651,13 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 		level = 5
 		fallthrough
 	case level >= 1 && level <= 6:
-		d.w.logReusePenalty = uint(level + 1)
+		d.w.logNewTablePenalty = 6
 		d.fast = newFastEnc(level)
 		d.window = make([]byte, maxStoreBlockSize)
 		d.fill = (*compressor).fillBlock
 		d.step = (*compressor).storeFast
 	case 7 <= level && level <= 9:
-		d.w.logReusePenalty = uint(level)
+		d.w.logNewTablePenalty = 10
 		d.state = &advancedState{}
 		d.compressionLevel = levels[level]
 		d.initDeflate()
@ -667,6 +666,7 @@ func (d *compressor) init(w io.Writer, level int) (err error) {
 	default:
 		return fmt.Errorf("flate: invalid compression level %d: want value in range [-2, 9]", level)
 	}
+	d.level = level
 	return nil
 }

@ -720,6 +720,7 @@ func (d *compressor) close() error {
 		return d.w.err
 	}
 	d.w.flush()
+	d.w.reset(nil)
 	return d.w.err
 }

@ -750,8 +751,7 @@ func NewWriter(w io.Writer, level int) (*Writer, error) {
 // can only be decompressed by a Reader initialized with the
 // same dictionary.
 func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
-	dw := &dictWriter{w}
-	zw, err := NewWriter(dw, level)
+	zw, err := NewWriter(w, level)
 	if err != nil {
 		return nil, err
 	}
@ -760,14 +760,6 @@ func NewWriterDict(w io.Writer, level int, dict []byte) (*Writer, error) {
 	return zw, err
 }

-type dictWriter struct {
-	w io.Writer
-}
-
-func (w *dictWriter) Write(b []byte) (n int, err error) {
-	return w.w.Write(b)
-}
-
 // A Writer takes data written to it and writes the compressed
 // form of that data to an underlying writer (see NewWriter).
 type Writer struct {
@ -805,11 +797,12 @@ func (w *Writer) Close() error {
 // the result of NewWriter or NewWriterDict called with dst
 // and w's level and dictionary.
 func (w *Writer) Reset(dst io.Writer) {
-	if dw, ok := w.d.w.writer.(*dictWriter); ok {
+	if len(w.dict) > 0 {
 		// w was created with NewWriterDict
-		dw.w = dst
-		w.d.reset(dw)
-		w.d.fillWindow(w.dict)
+		w.d.reset(dst)
+		if dst != nil {
+			w.d.fillWindow(w.dict)
+		}
 	} else {
 		// w was created with NewWriter
 		w.d.reset(dst)
--- a/vendor/github.com/klauspost/compress/flate/fast_encoder.go
+++ b/vendor/github.com/klauspost/compress/flate/fast_encoder.go
@ -35,17 +35,17 @@ func newFastEnc(level int) fastEnc {
 }

 const (
-	tableBits       = 16             // Bits used in the table
+	tableBits       = 15             // Bits used in the table
 	tableSize       = 1 << tableBits // Size of the table
 	tableShift      = 32 - tableBits // Right-shift to get the tableBits most significant bits of a uint32.
 	baseMatchOffset = 1              // The smallest match offset
 	baseMatchLength = 3              // The smallest match length per the RFC section 3.2.5
 	maxMatchOffset  = 1 << 15        // The largest match offset

-	bTableBits   = 18                                           // Bits used in the big tables
-	bTableSize   = 1 << bTableBits                              // Size of the table
-	allocHistory = maxMatchOffset * 10                          // Size to preallocate for history.
-	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize // Reset the buffer offset when reaching this.
+	bTableBits   = 17                                               // Bits used in the big tables
+	bTableSize   = 1 << bTableBits                                  // Size of the table
+	allocHistory = maxStoreBlockSize * 10                           // Size to preallocate for history.
+	bufferReset  = (1 << 31) - allocHistory - maxStoreBlockSize - 1 // Reset the buffer offset when reaching this.
 )

 const (
@ -92,7 +92,6 @@ func hash(u uint32) uint32 {
 }

 type tableEntry struct {
-	val    uint32
 	offset int32
 }

@ -210,16 +209,14 @@ func (e *fastGen) matchlenLong(s, t int32, src []byte) int32 {

 // Reset the encoding table.
 func (e *fastGen) Reset() {
-	if cap(e.hist) < int(maxMatchOffset*8) {
-		l := maxMatchOffset * 8
-		// Make it at least 1MB.
-		if l < 1<<20 {
-			l = 1 << 20
-		}
-		e.hist = make([]byte, 0, l)
+	if cap(e.hist) < allocHistory {
+		e.hist = make([]byte, 0, allocHistory)
+	}
+	// We offset current position so everything will be out of reach.
+	// If we are above the buffer reset it will be cleared anyway since len(hist) == 0.
+	if e.cur <= bufferReset {
+		e.cur += maxMatchOffset + int32(len(e.hist))
 	}
-	// We offset current position so everything will be out of reach
-	e.cur += maxMatchOffset + int32(len(e.hist))
 	e.hist = e.hist[:0]
 }

--- a/vendor/github.com/klauspost/compress/flate/gen_inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/gen_inflate.go
@ -0,0 +1,274 @@
+// +build generate
+
+//go:generate go run $GOFILE && gofmt -w inflate_gen.go
+
+package main
+
+import (
+	"os"
+	"strings"
+)
+
+func main() {
+	f, err := os.Create("inflate_gen.go")
+	if err != nil {
+		panic(err)
+	}
+	defer f.Close()
+	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
+	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
+	imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
+	f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+`)
+
+	for _, imp := range imports {
+		f.WriteString("\t\"" + imp + "\"\n")
+	}
+	f.WriteString(")\n\n")
+
+	template := `
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) $FUNCNAME$() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.($TYPE$)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).$FUNCNAME$
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+`
+	for i, t := range types {
+		s := strings.Replace(template, "$FUNCNAME$", "huffman"+names[i], -1)
+		s = strings.Replace(s, "$TYPE$", t, -1)
+		f.WriteString(s)
+	}
+	f.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n")
+	f.WriteString("\tswitch f.r.(type) {\n")
+	for i, t := range types {
+		f.WriteString("\t\tcase " + t + ":\n")
+		f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
+	}
+	f.WriteString("\t\tdefault:\n")
+	f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
+	f.WriteString("\t}\n}\n")
+}
--- a/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_bit_writer.go
@ -93,12 +93,12 @@ type huffmanBitWriter struct {
 	err             error
 	lastHeader      int
 	// Set between 0 (reused block can be up to 2x the size)
-	logReusePenalty uint
-	lastHuffMan     bool
-	bytes           [256]byte
-	literalFreq     [lengthCodesStart + 32]uint16
-	offsetFreq      [32]uint16
-	codegenFreq     [codegenCodeCount]uint16
+	logNewTablePenalty uint
+	lastHuffMan        bool
+	bytes              [256]byte
+	literalFreq        [lengthCodesStart + 32]uint16
+	offsetFreq         [32]uint16
+	codegenFreq        [codegenCodeCount]uint16

 	// codegen must have an extra space for the final symbol.
 	codegen [literalCount + offsetCodeCount + 1]uint8
@ -119,7 +119,7 @@ type huffmanBitWriter struct {
 // If lastHuffMan is set, a table for outputting literals has been generated and offsets are invalid.
 //
 // An incoming block estimates the output size of a new table using a 'fresh' by calculating the
-// optimal size and adding a penalty in 'logReusePenalty'.
+// optimal size and adding a penalty in 'logNewTablePenalty'.
 // A Huffman table is not optimal, which is why we add a penalty, and generating a new table
 // is slower both for compression and decompression.

@ -135,7 +135,6 @@ func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
 func (w *huffmanBitWriter) reset(writer io.Writer) {
 	w.writer = writer
 	w.bits, w.nbits, w.nbytes, w.err = 0, 0, 0, nil
-	w.bytes = [256]byte{}
 	w.lastHeader = 0
 	w.lastHuffMan = false
 }
@ -178,6 +177,11 @@ func (w *huffmanBitWriter) flush() {
 		w.nbits = 0
 		return
 	}
+	if w.lastHeader > 0 {
+		// We owe an EOB
+		w.writeCode(w.literalEncoding.codes[endBlockMarker])
+		w.lastHeader = 0
+	}
 	n := w.nbytes
 	for w.nbits != 0 {
 		w.bytes[n] = byte(w.bits)
@ -350,6 +354,13 @@ func (w *huffmanBitWriter) headerSize() (size, numCodegens int) {
 		int(w.codegenFreq[18])*7, numCodegens
 }

+// dynamicSize returns the size of dynamically encoded data in bits.
+func (w *huffmanBitWriter) dynamicReuseSize(litEnc, offEnc *huffmanEncoder) (size int) {
+	size = litEnc.bitLength(w.literalFreq[:]) +
+		offEnc.bitLength(w.offsetFreq[:])
+	return size
+}
+
 // dynamicSize returns the size of dynamically encoded data in bits.
 func (w *huffmanBitWriter) dynamicSize(litEnc, offEnc *huffmanEncoder, extraBits int) (size, numCodegens int) {
 	header, numCodegens := w.headerSize()
@ -452,30 +463,30 @@ func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, n

 	i := 0
 	for {
-		var codeWord int = int(w.codegen[i])
+		var codeWord = uint32(w.codegen[i])
 		i++
 		if codeWord == badCode {
 			break
 		}
-		w.writeCode(w.codegenEncoding.codes[uint32(codeWord)])
+		w.writeCode(w.codegenEncoding.codes[codeWord])

 		switch codeWord {
 		case 16:
 			w.writeBits(int32(w.codegen[i]), 2)
 			i++
-			break
 		case 17:
 			w.writeBits(int32(w.codegen[i]), 3)
 			i++
-			break
 		case 18:
 			w.writeBits(int32(w.codegen[i]), 7)
 			i++
-			break
 		}
 	}
 }

+// writeStoredHeader will write a stored header.
+// If the stored block is only used for EOF,
+// it is replaced with a fixed huffman block.
 func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
 	if w.err != nil {
 		return
@ -485,6 +496,16 @@ func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
 		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
 	}
+
+	// To write EOF, use a fixed encoding block. 10 bits instead of 5 bytes.
+	if length == 0 && isEof {
+		w.writeFixedHeader(isEof)
+		// EOB: 7 bits, value: 0
+		w.writeBits(0, 7)
+		w.flush()
+		return
+	}
+
 	var flag int32
 	if isEof {
 		flag = 1
@ -591,8 +612,8 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 		tokens.AddEOB()
 	}

-	// We cannot reuse pure huffman table.
-	if w.lastHuffMan && w.lastHeader > 0 {
+	// We cannot reuse pure huffman table, and must mark as EOF.
+	if (w.lastHuffMan || eof) && w.lastHeader > 0 {
 		// We will not try to reuse.
 		w.writeCode(w.literalEncoding.codes[endBlockMarker])
 		w.lastHeader = 0
@ -606,14 +627,14 @@ func (w *huffmanBitWriter) writeBlockDynamic(tokens *tokens, eof bool, input []b
 	var size int
 	// Check if we should reuse.
 	if w.lastHeader > 0 {
-		// Estimate size for using a new table
+		// Estimate size for using a new table.
+		// Use the previous header size as the best estimate.
 		newSize := w.lastHeader + tokens.EstimatedBits()
+		newSize += newSize >> w.logNewTablePenalty

 		// The estimated size is calculated as an optimal table.
 		// We add a penalty to make it more realistic and re-use a bit more.
-		newSize += newSize >> (w.logReusePenalty & 31)
-		extra := w.extraBitSize()
-		reuseSize, _ := w.dynamicSize(w.literalEncoding, w.offsetEncoding, extra)
+		reuseSize := w.dynamicReuseSize(w.literalEncoding, w.offsetEncoding) + w.extraBitSize()

 		// Check if a new table is better.
 		if newSize < reuseSize {
@ -805,21 +826,30 @@ func (w *huffmanBitWriter) writeBlockHuff(eof bool, input []byte, sync bool) {
 	}

 	// Add everything as literals
-	estBits := histogramSize(input, w.literalFreq[:], !eof && !sync) + 15
+	// We have to estimate the header size.
+	// Assume header is around 70 bytes:
+	// https://stackoverflow.com/a/25454430
+	const guessHeaderSizeBits = 70 * 8
+	estBits, estExtra := histogramSize(input, w.literalFreq[:], !eof && !sync)
+	estBits += w.lastHeader + 15
+	if w.lastHeader == 0 {
+		estBits += guessHeaderSizeBits
+	}
+	estBits += estBits >> w.logNewTablePenalty

 	// Store bytes, if we don't get a reasonable improvement.
 	ssize, storable := w.storedSize(input)
-	if storable && ssize < (estBits+estBits>>4) {
+	if storable && ssize < estBits {
 		w.writeStoredHeader(len(input), eof)
 		w.writeBytes(input)
 		return
 	}

 	if w.lastHeader > 0 {
-		size, _ := w.dynamicSize(w.literalEncoding, huffOffset, w.lastHeader)
-		estBits += estBits >> (w.logReusePenalty)
+		reuseSize := w.literalEncoding.bitLength(w.literalFreq[:256])
+		estBits += estExtra

-		if estBits < size {
+		if estBits < reuseSize {
 			// We owe an EOB
 			w.writeCode(w.literalEncoding.codes[endBlockMarker])
 			w.lastHeader = 0
--- a/vendor/github.com/klauspost/compress/flate/huffman_code.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_code.go
@ -7,7 +7,6 @@ package flate
 import (
 	"math"
 	"math/bits"
-	"sort"
 )

 const (
@ -25,8 +24,6 @@ type huffmanEncoder struct {
 	codes     []hcode
 	freqcache []literalNode
 	bitCount  [17]int32
-	lns       byLiteral // stored to avoid repeated allocation in generate
-	lfs       byFreq    // stored to avoid repeated allocation in generate
 }

 type literalNode struct {
@ -85,17 +82,14 @@ func generateFixedLiteralEncoding() *huffmanEncoder {
 			// size 8, 000110000  .. 10111111
 			bits = ch + 48
 			size = 8
-			break
 		case ch < 256:
 			// size 9, 110010000 .. 111111111
 			bits = ch + 400 - 144
 			size = 9
-			break
 		case ch < 280:
 			// size 7, 0000000 .. 0010111
 			bits = ch - 256
 			size = 7
-			break
 		default:
 			// size 8, 11000000 .. 11000111
 			bits = ch + 192 - 280
@ -115,8 +109,8 @@ func generateFixedOffsetEncoding() *huffmanEncoder {
 	return h
 }

-var fixedLiteralEncoding *huffmanEncoder = generateFixedLiteralEncoding()
-var fixedOffsetEncoding *huffmanEncoder = generateFixedOffsetEncoding()
+var fixedLiteralEncoding = generateFixedLiteralEncoding()
+var fixedOffsetEncoding = generateFixedOffsetEncoding()

 func (h *huffmanEncoder) bitLength(freq []uint16) int {
 	var total int
@ -273,7 +267,7 @@ func (h *huffmanEncoder) assignEncodingAndSize(bitCount []int32, list []literalN
 		// assigned in literal order (not frequency order).
 		chunk := list[len(list)-int(bits):]

-		h.lns.sort(chunk)
+		sortByLiteral(chunk)
 		for _, node := range chunk {
 			h.codes[node.literal] = hcode{code: reverseBits(code, uint8(n)), len: uint16(n)}
 			code++
@ -318,7 +312,7 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 		}
 		return
 	}
-	h.lfs.sort(list)
+	sortByFreq(list)

 	// Get the number of literals for each bit count
 	bitCount := h.bitCounts(list, maxBits)
@ -326,59 +320,44 @@ func (h *huffmanEncoder) generate(freq []uint16, maxBits int32) {
 	h.assignEncodingAndSize(bitCount, list)
 }

-type byLiteral []literalNode
-
-func (s *byLiteral) sort(a []literalNode) {
-	*s = byLiteral(a)
-	sort.Sort(s)
-}
-
-func (s byLiteral) Len() int { return len(s) }
-
-func (s byLiteral) Less(i, j int) bool {
-	return s[i].literal < s[j].literal
-}
-
-func (s byLiteral) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
-
-type byFreq []literalNode
-
-func (s *byFreq) sort(a []literalNode) {
-	*s = byFreq(a)
-	sort.Sort(s)
-}
-
-func (s byFreq) Len() int { return len(s) }
-
-func (s byFreq) Less(i, j int) bool {
-	if s[i].freq == s[j].freq {
-		return s[i].literal < s[j].literal
+func atLeastOne(v float32) float32 {
+	if v < 1 {
+		return 1
 	}
-	return s[i].freq < s[j].freq
+	return v
 }

-func (s byFreq) Swap(i, j int) { s[i], s[j] = s[j], s[i] }
-
 // histogramSize accumulates a histogram of b in h.
 // An estimated size in bits is returned.
 // Unassigned values are assigned '1' in the histogram.
 // len(h) must be >= 256, and h's elements must be all zeroes.
-func histogramSize(b []byte, h []uint16, fill bool) int {
+func histogramSize(b []byte, h []uint16, fill bool) (int, int) {
 	h = h[:256]
 	for _, t := range b {
 		h[t]++
 	}
-	invTotal := 1.0 / float64(len(b))
-	shannon := 0.0
-	single := math.Ceil(-math.Log2(invTotal))
-	for i, v := range h[:] {
-		if v > 0 {
-			n := float64(v)
-			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
-		} else if fill {
-			shannon += single
-			h[i] = 1
+	invTotal := 1.0 / float32(len(b))
+	shannon := float32(0.0)
+	var extra float32
+	if fill {
+		oneBits := atLeastOne(-mFastLog2(invTotal))
+		for i, v := range h[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+			} else {
+				h[i] = 1
+				extra += oneBits
+			}
+		}
+	} else {
+		for _, v := range h[:] {
+			if v > 0 {
+				n := float32(v)
+				shannon += atLeastOne(-mFastLog2(n*invTotal)) * n
+			}
 		}
 	}
-	return int(shannon + 0.99)
+
+	return int(shannon + 0.99), int(extra + 0.99)
 }
--- a/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByFreq.go
@ -0,0 +1,178 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// Sort sorts data.
+// It makes one call to data.Len to determine n, and O(n*log(n)) calls to
+// data.Less and data.Swap. The sort is not guaranteed to be stable.
+func sortByFreq(data []literalNode) {
+	n := len(data)
+	quickSortByFreq(data, 0, n, maxDepth(n))
+}
+
+func quickSortByFreq(data []literalNode, a, b, maxDepth int) {
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b)
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivotByFreq(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSortByFreq(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSortByFreq(data, mhi, b)
+		} else {
+			quickSortByFreq(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSortByFreq(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It could be written in this simplified form cause b-a <= 12
+		for i := a + 6; i < b; i++ {
+			if data[i].freq == data[i-6].freq && data[i].literal < data[i-6].literal || data[i].freq < data[i-6].freq {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSortByFreq(data, a, b)
+	}
+}
+
+// siftDownByFreq implements the heap property on data[lo, hi).
+// first is an offset into the array where the root of the heap lies.
+func siftDownByFreq(data []literalNode, lo, hi, first int) {
+	root := lo
+	for {
+		child := 2*root + 1
+		if child >= hi {
+			break
+		}
+		if child+1 < hi && (data[first+child].freq == data[first+child+1].freq && data[first+child].literal < data[first+child+1].literal || data[first+child].freq < data[first+child+1].freq) {
+			child++
+		}
+		if data[first+root].freq == data[first+child].freq && data[first+root].literal > data[first+child].literal || data[first+root].freq > data[first+child].freq {
+			return
+		}
+		data[first+root], data[first+child] = data[first+child], data[first+root]
+		root = child
+	}
+}
+func doPivotByFreq(data []literalNode, lo, hi int) (midlo, midhi int) {
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThreeSortByFreq(data, lo, lo+s, lo+2*s)
+		medianOfThreeSortByFreq(data, m, m-s, m+s)
+		medianOfThreeSortByFreq(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThreeSortByFreq(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by ChoosePivot)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ {
+	}
+	b := a
+	for {
+		for ; b < c && (data[pivot].freq == data[b].freq && data[pivot].literal > data[b].literal || data[pivot].freq > data[b].freq); b++ { // data[b] <= pivot
+		}
+		for ; b < c && (data[pivot].freq == data[c-1].freq && data[pivot].literal < data[c-1].literal || data[pivot].freq < data[c-1].freq); c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Lets test some points for equality to pivot
+		dups := 0
+		if data[pivot].freq == data[hi-1].freq && data[pivot].literal > data[hi-1].literal || data[pivot].freq > data[hi-1].freq { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].freq == data[pivot].freq && data[m].literal > data[pivot].literal || data[m].freq > data[pivot].freq { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && (data[b-1].freq == data[pivot].freq && data[b-1].literal > data[pivot].literal || data[b-1].freq > data[pivot].freq); b-- { // data[b] == pivot
+			}
+			for ; a < b && (data[a].freq == data[pivot].freq && data[a].literal < data[pivot].literal || data[a].freq < data[pivot].freq); a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// Insertion sort
+func insertionSortByFreq(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && (data[j].freq == data[j-1].freq && data[j].literal < data[j-1].literal || data[j].freq < data[j-1].freq); j-- {
+			data[j], data[j-1] = data[j-1], data[j]
+		}
+	}
+}
+
+// quickSortByFreq, loosely following Bentley and McIlroy,
+// ``Engineering a Sort Function,'' SP&E November 1993.
+
+// medianOfThreeSortByFreq moves the median of the three values data[m0], data[m1], data[m2] into data[m1].
+func medianOfThreeSortByFreq(data []literalNode, m1, m0, m2 int) {
+	// sort 3 elements
+	if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].freq == data[m1].freq && data[m2].literal < data[m1].literal || data[m2].freq < data[m1].freq {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].freq == data[m0].freq && data[m1].literal < data[m0].literal || data[m1].freq < data[m0].freq {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
--- a/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
+++ b/vendor/github.com/klauspost/compress/flate/huffman_sortByLiteral.go
@ -0,0 +1,201 @@
+// Copyright 2009 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+package flate
+
+// Sort sorts data.
+// It makes one call to data.Len to determine n, and O(n*log(n)) calls to
+// data.Less and data.Swap. The sort is not guaranteed to be stable.
+func sortByLiteral(data []literalNode) {
+	n := len(data)
+	quickSort(data, 0, n, maxDepth(n))
+}
+
+func quickSort(data []literalNode, a, b, maxDepth int) {
+	for b-a > 12 { // Use ShellSort for slices <= 12 elements
+		if maxDepth == 0 {
+			heapSort(data, a, b)
+			return
+		}
+		maxDepth--
+		mlo, mhi := doPivot(data, a, b)
+		// Avoiding recursion on the larger subproblem guarantees
+		// a stack depth of at most lg(b-a).
+		if mlo-a < b-mhi {
+			quickSort(data, a, mlo, maxDepth)
+			a = mhi // i.e., quickSort(data, mhi, b)
+		} else {
+			quickSort(data, mhi, b, maxDepth)
+			b = mlo // i.e., quickSort(data, a, mlo)
+		}
+	}
+	if b-a > 1 {
+		// Do ShellSort pass with gap 6
+		// It could be written in this simplified form cause b-a <= 12
+		for i := a + 6; i < b; i++ {
+			if data[i].literal < data[i-6].literal {
+				data[i], data[i-6] = data[i-6], data[i]
+			}
+		}
+		insertionSort(data, a, b)
+	}
+}
+func heapSort(data []literalNode, a, b int) {
+	first := a
+	lo := 0
+	hi := b - a
+
+	// Build heap with greatest element at top.
+	for i := (hi - 1) / 2; i >= 0; i-- {
+		siftDown(data, i, hi, first)
+	}
+
+	// Pop elements, largest first, into end of data.
+	for i := hi - 1; i >= 0; i-- {
+		data[first], data[first+i] = data[first+i], data[first]
+		siftDown(data, lo, i, first)
+	}
+}
+
+// siftDown implements the heap property on data[lo, hi).
+// first is an offset into the array where the root of the heap lies.
+func siftDown(data []literalNode, lo, hi, first int) {
+	root := lo
+	for {
+		child := 2*root + 1
+		if child >= hi {
+			break
+		}
+		if child+1 < hi && data[first+child].literal < data[first+child+1].literal {
+			child++
+		}
+		if data[first+root].literal > data[first+child].literal {
+			return
+		}
+		data[first+root], data[first+child] = data[first+child], data[first+root]
+		root = child
+	}
+}
+func doPivot(data []literalNode, lo, hi int) (midlo, midhi int) {
+	m := int(uint(lo+hi) >> 1) // Written like this to avoid integer overflow.
+	if hi-lo > 40 {
+		// Tukey's ``Ninther,'' median of three medians of three.
+		s := (hi - lo) / 8
+		medianOfThree(data, lo, lo+s, lo+2*s)
+		medianOfThree(data, m, m-s, m+s)
+		medianOfThree(data, hi-1, hi-1-s, hi-1-2*s)
+	}
+	medianOfThree(data, lo, m, hi-1)
+
+	// Invariants are:
+	//	data[lo] = pivot (set up by ChoosePivot)
+	//	data[lo < i < a] < pivot
+	//	data[a <= i < b] <= pivot
+	//	data[b <= i < c] unexamined
+	//	data[c <= i < hi-1] > pivot
+	//	data[hi-1] >= pivot
+	pivot := lo
+	a, c := lo+1, hi-1
+
+	for ; a < c && data[a].literal < data[pivot].literal; a++ {
+	}
+	b := a
+	for {
+		for ; b < c && data[pivot].literal > data[b].literal; b++ { // data[b] <= pivot
+		}
+		for ; b < c && data[pivot].literal < data[c-1].literal; c-- { // data[c-1] > pivot
+		}
+		if b >= c {
+			break
+		}
+		// data[b] > pivot; data[c-1] <= pivot
+		data[b], data[c-1] = data[c-1], data[b]
+		b++
+		c--
+	}
+	// If hi-c<3 then there are duplicates (by property of median of nine).
+	// Let's be a bit more conservative, and set border to 5.
+	protect := hi-c < 5
+	if !protect && hi-c < (hi-lo)/4 {
+		// Lets test some points for equality to pivot
+		dups := 0
+		if data[pivot].literal > data[hi-1].literal { // data[hi-1] = pivot
+			data[c], data[hi-1] = data[hi-1], data[c]
+			c++
+			dups++
+		}
+		if data[b-1].literal > data[pivot].literal { // data[b-1] = pivot
+			b--
+			dups++
+		}
+		// m-lo = (hi-lo)/2 > 6
+		// b-lo > (hi-lo)*3/4-1 > 8
+		// ==> m < b ==> data[m] <= pivot
+		if data[m].literal > data[pivot].literal { // data[m] = pivot
+			data[m], data[b-1] = data[b-1], data[m]
+			b--
+			dups++
+		}
+		// if at least 2 points are equal to pivot, assume skewed distribution
+		protect = dups > 1
+	}
+	if protect {
+		// Protect against a lot of duplicates
+		// Add invariant:
+		//	data[a <= i < b] unexamined
+		//	data[b <= i < c] = pivot
+		for {
+			for ; a < b && data[b-1].literal > data[pivot].literal; b-- { // data[b] == pivot
+			}
+			for ; a < b && data[a].literal < data[pivot].literal; a++ { // data[a] < pivot
+			}
+			if a >= b {
+				break
+			}
+			// data[a] == pivot; data[b-1] < pivot
+			data[a], data[b-1] = data[b-1], data[a]
+			a++
+			b--
+		}
+	}
+	// Swap pivot into middle
+	data[pivot], data[b-1] = data[b-1], data[pivot]
+	return b - 1, c
+}
+
+// Insertion sort
+func insertionSort(data []literalNode, a, b int) {
+	for i := a + 1; i < b; i++ {
+		for j := i; j > a && data[j].literal < data[j-1].literal; j-- {
+			data[j], data[j-1] = data[j-1], data[j]
+		}
+	}
+}
+
+// maxDepth returns a threshold at which quicksort should switch
+// to heapsort. It returns 2*ceil(lg(n+1)).
+func maxDepth(n int) int {
+	var depth int
+	for i := n; i > 0; i >>= 1 {
+		depth++
+	}
+	return depth * 2
+}
+
+// medianOfThree moves the median of the three values data[m0], data[m1], data[m2] into data[m1].
+func medianOfThree(data []literalNode, m1, m0, m2 int) {
+	// sort 3 elements
+	if data[m1].literal < data[m0].literal {
+		data[m1], data[m0] = data[m0], data[m1]
+	}
+	// data[m0] <= data[m1]
+	if data[m2].literal < data[m1].literal {
+		data[m2], data[m1] = data[m1], data[m2]
+		// data[m0] <= data[m2] && data[m1] < data[m2]
+		if data[m1].literal < data[m0].literal {
+			data[m1], data[m0] = data[m0], data[m1]
+		}
+	}
+	// now data[m0] <= data[m1] <= data[m2]
+}
--- a/vendor/github.com/klauspost/compress/flate/inflate.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate.go
@ -106,7 +106,7 @@ const (
 )

 type huffmanDecoder struct {
-	min      int                       // the minimum code length
+	maxRead  int                       // the maximum number of bits we can read and not overread
 	chunks   *[huffmanNumChunks]uint16 // chunks as described above
 	links    [][]uint16                // overflow links
 	linkMask uint32                    // mask the width of the link table
@ -126,12 +126,12 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	if h.chunks == nil {
 		h.chunks = &[huffmanNumChunks]uint16{}
 	}
-	if h.min != 0 {
+	if h.maxRead != 0 {
 		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
 	}

 	// Count number of codes of each length,
-	// compute min and max length.
+	// compute maxRead and max length.
 	var count [maxCodeLen]int
 	var min, max int
 	for _, n := range lengths {
@ -178,7 +178,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 		return false
 	}

-	h.min = min
+	h.maxRead = min
 	chunks := h.chunks[:]
 	for i := range chunks {
 		chunks[i] = 0
@ -342,7 +342,7 @@ func (f *decompressor) nextBlock() {
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
-		f.huffmanBlock()
+		f.huffmanBlockDecoder()()
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@ -350,7 +350,7 @@ func (f *decompressor) nextBlock() {
 		}
 		f.hl = &f.h1
 		f.hd = &f.h2
-		f.huffmanBlock()
+		f.huffmanBlockDecoder()()
 	default:
 		// 3 is reserved.
 		if debugDecode {
@ -543,12 +543,18 @@ func (f *decompressor) readHuffman() error {
 		return CorruptInputError(f.roffset)
 	}

-	// As an optimization, we can initialize the min bits to read at a time
+	// As an optimization, we can initialize the maxRead bits to read at a time
 	// for the HLIT tree to the length of the EOB marker since we know that
 	// every block must terminate with one. This preserves the property that
 	// we never read any extra bytes after the end of the DEFLATE stream.
-	if f.h1.min < f.bits[endBlockMarker] {
-		f.h1.min = f.bits[endBlockMarker]
+	if f.h1.maxRead < f.bits[endBlockMarker] {
+		f.h1.maxRead = f.bits[endBlockMarker]
+	}
+	if !f.final {
+		// If not the final block, the smallest block possible is
+		// a predefined table, BTYPE=01, with a single EOB marker.
+		// This will take up 3 + 7 bits.
+		f.h1.maxRead += 10
 	}

 	return nil
@ -558,7 +564,7 @@ func (f *decompressor) readHuffman() error {
 // hl and hd are the Huffman states for the lit/length values
 // and the distance values, respectively. If hd == nil, using the
 // fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlock() {
+func (f *decompressor) huffmanBlockGeneric() {
 	const (
 		stateInit = iota // Zero value must be stateInit
 		stateDict
@ -574,19 +580,64 @@ func (f *decompressor) huffmanBlock() {
 readLiteral:
 	// Read literal and/or (length, distance) according to RFC section 3.2.3.
 	{
-		v, err := f.huffSym(f.hl)
-		if err != nil {
-			f.err = err
-			return
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := f.r.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
 		}
+
 		var n uint // number of bits extra
 		var length int
+		var err error
 		switch {
 		case v < 256:
 			f.dict.writeByte(byte(v))
 			if f.dict.availWrite() == 0 {
 				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlock
+				f.step = (*decompressor).huffmanBlockGeneric
 				f.stepState = stateInit
 				return
 			}
@ -714,7 +765,7 @@ copyHistory:

 		if f.dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlock // We need to continue this work
+			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
 			f.stepState = stateDict
 			return
 		}
@ -726,21 +777,33 @@ copyHistory:
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
 	// Discard current half-byte.
-	f.nb = 0
-	f.b = 0
+	left := (f.nb) & 7
+	f.nb -= left
+	f.b >>= left
+
+	offBytes := f.nb >> 3
+	// Unfilled values will be overwritten.
+	f.buf[0] = uint8(f.b)
+	f.buf[1] = uint8(f.b >> 8)
+	f.buf[2] = uint8(f.b >> 16)
+	f.buf[3] = uint8(f.b >> 24)
+
+	f.roffset += int64(offBytes)
+	f.nb, f.b = 0, 0

 	// Length then ones-complement of length.
-	nr, err := io.ReadFull(f.r, f.buf[0:4])
+	nr, err := io.ReadFull(f.r, f.buf[offBytes:4])
 	f.roffset += int64(nr)
 	if err != nil {
 		f.err = noEOF(err)
 		return
 	}
-	n := int(f.buf[0]) | int(f.buf[1])<<8
-	nn := int(f.buf[2]) | int(f.buf[3])<<8
-	if uint16(nn) != uint16(^n) {
+	n := uint16(f.buf[0]) | uint16(f.buf[1])<<8
+	nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8
+	if nn != ^n {
 		if debugDecode {
-			fmt.Println("uint16(nn) != uint16(^n)", nn, ^n)
+			ncomp := ^n
+			fmt.Println("uint16(nn) != uint16(^n)", nn, ncomp)
 		}
 		f.err = CorruptInputError(f.roffset)
 		return
@ -752,7 +815,7 @@ func (f *decompressor) dataBlock() {
 		return
 	}

-	f.copyLen = n
+	f.copyLen = int(n)
 	f.copyData()
 }

@ -816,7 +879,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
 	// with single element, huffSym must error on these two edge cases. In both
 	// cases, the chunks slice will be 0 for the invalid sequence, leading it
 	// satisfy the n == 0 check below.
-	n := uint(h.min)
+	n := uint(h.maxRead)
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
--- a/vendor/github.com/klauspost/compress/flate/inflate_gen.go
+++ b/vendor/github.com/klauspost/compress/flate/inflate_gen.go
@ -0,0 +1,922 @@
+// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+	"bufio"
+	"bytes"
+	"fmt"
+	"math/bits"
+	"strings"
+)
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBytesBuffer() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Buffer)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesBuffer
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBytesReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBufioReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bufio.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBufioReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// Decode a single Huffman block from f.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanStringsReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*strings.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						if debugDecode {
+							fmt.Println("huffsym: n==0")
+						}
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			if debugDecode {
+				fmt.Println(v, ">= maxNumLit")
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits n>0:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<5:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				if debugDecode {
+					fmt.Println("huffsym:", err)
+				}
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					if debugDecode {
+						fmt.Println("morebits f.nb<nb:", err)
+					}
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			if debugDecode {
+				fmt.Println("dist too big:", dist, maxNumDist)
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			if debugDecode {
+				fmt.Println("dist > f.dict.histSize():", dist, f.dict.histSize())
+			}
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+func (f *decompressor) huffmanBlockDecoder() func() {
+	switch f.r.(type) {
+	case *bytes.Buffer:
+		return f.huffmanBytesBuffer
+	case *bytes.Reader:
+		return f.huffmanBytesReader
+	case *bufio.Reader:
+		return f.huffmanBufioReader
+	case *strings.Reader:
+		return f.huffmanStringsReader
+	default:
+		return f.huffmanBlockGeneric
+	}
+}
--- a/vendor/github.com/klauspost/compress/flate/level1.go
+++ b/vendor/github.com/klauspost/compress/flate/level1.go
@ -1,5 +1,7 @@
 package flate

+import "fmt"
+
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
 // This is the generic implementation.
@ -14,6 +16,9 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}

 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
@ -76,12 +81,12 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			}

 			now := load6432(src, nextS)
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
 			nextHash = hash(uint32(now))

 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == candidate.val {
-				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}

@ -91,11 +96,11 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			nextS++
 			candidate = e.table[nextHash]
 			now >>= 8
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			e.table[nextHash] = tableEntry{offset: s + e.cur}

 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == candidate.val {
-				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}
 			cv = uint32(now)
@ -134,7 +139,7 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(s+l+4) < len(src) {
 					cv := load3232(src, s)
-					e.table[hash(cv)] = tableEntry{offset: s + e.cur, val: cv}
+					e.table[hash(cv)] = tableEntry{offset: s + e.cur}
 				}
 				goto emitRemainder
 			}
@ -148,14 +153,14 @@ func (e *fastEncL1) Encode(dst *tokens, src []byte) {
 			x := load6432(src, s-2)
 			o := e.cur + s - 2
 			prevHash := hash(uint32(x))
-			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
+			e.table[prevHash] = tableEntry{offset: o}
 			x >>= 16
 			currHash := hash(uint32(x))
 			candidate = e.table[currHash]
-			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x)}
+			e.table[currHash] = tableEntry{offset: o + 2}

 			offset := s - (candidate.offset - e.cur)
-			if offset > maxMatchOffset || uint32(x) != candidate.val {
+			if offset > maxMatchOffset || uint32(x) != load3232(src, candidate.offset-e.cur) {
 				cv = uint32(x >> 8)
 				s++
 				break
--- a/vendor/github.com/klauspost/compress/flate/level2.go
+++ b/vendor/github.com/klauspost/compress/flate/level2.go
@ -1,5 +1,7 @@
 package flate

+import "fmt"
+
 // fastGen maintains the table for matches,
 // and the previous byte block for level 2.
 // This is the generic implementation.
@ -16,6 +18,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)

+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
@ -77,12 +83,12 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			}
 			candidate = e.table[nextHash]
 			now := load6432(src, nextS)
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			e.table[nextHash] = tableEntry{offset: s + e.cur}
 			nextHash = hash4u(uint32(now), bTableBits)

 			offset := s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == candidate.val {
-				e.table[nextHash] = tableEntry{offset: nextS + e.cur, val: uint32(now)}
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
+				e.table[nextHash] = tableEntry{offset: nextS + e.cur}
 				break
 			}

@ -92,10 +98,10 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			nextS++
 			candidate = e.table[nextHash]
 			now >>= 8
-			e.table[nextHash] = tableEntry{offset: s + e.cur, val: cv}
+			e.table[nextHash] = tableEntry{offset: s + e.cur}

 			offset = s - (candidate.offset - e.cur)
-			if offset < maxMatchOffset && cv == candidate.val {
+			if offset < maxMatchOffset && cv == load3232(src, candidate.offset-e.cur) {
 				break
 			}
 			cv = uint32(now)
@ -142,7 +148,7 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 				// Index first pair after match end.
 				if int(s+l+4) < len(src) {
 					cv := load3232(src, s)
-					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur, val: cv}
+					e.table[hash4u(cv, bTableBits)] = tableEntry{offset: s + e.cur}
 				}
 				goto emitRemainder
 			}
@ -151,15 +157,15 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			for i := s - l + 2; i < s-5; i += 7 {
 				x := load6432(src, int32(i))
 				nextHash := hash4u(uint32(x), bTableBits)
-				e.table[nextHash] = tableEntry{offset: e.cur + i, val: uint32(x)}
+				e.table[nextHash] = tableEntry{offset: e.cur + i}
 				// Skip one
 				x >>= 16
 				nextHash = hash4u(uint32(x), bTableBits)
-				e.table[nextHash] = tableEntry{offset: e.cur + i + 2, val: uint32(x)}
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 2}
 				// Skip one
 				x >>= 16
 				nextHash = hash4u(uint32(x), bTableBits)
-				e.table[nextHash] = tableEntry{offset: e.cur + i + 4, val: uint32(x)}
+				e.table[nextHash] = tableEntry{offset: e.cur + i + 4}
 			}

 			// We could immediately start working at s now, but to improve
@ -172,14 +178,14 @@ func (e *fastEncL2) Encode(dst *tokens, src []byte) {
 			o := e.cur + s - 2
 			prevHash := hash4u(uint32(x), bTableBits)
 			prevHash2 := hash4u(uint32(x>>8), bTableBits)
-			e.table[prevHash] = tableEntry{offset: o, val: uint32(x)}
-			e.table[prevHash2] = tableEntry{offset: o + 1, val: uint32(x >> 8)}
+			e.table[prevHash] = tableEntry{offset: o}
+			e.table[prevHash2] = tableEntry{offset: o + 1}
 			currHash := hash4u(uint32(x>>16), bTableBits)
 			candidate = e.table[currHash]
-			e.table[currHash] = tableEntry{offset: o + 2, val: uint32(x >> 16)}
+			e.table[currHash] = tableEntry{offset: o + 2}

 			offset := s - (candidate.offset - e.cur)
-			if offset > maxMatchOffset || uint32(x>>16) != candidate.val {
+			if offset > maxMatchOffset || uint32(x>>16) != load3232(src, candidate.offset-e.cur) {
 				cv = uint32(x >> 24)
 				s++
 				break
--- a/vendor/github.com/klauspost/compress/flate/level3.go
+++ b/vendor/github.com/klauspost/compress/flate/level3.go
@ -1,5 +1,7 @@
 package flate

+import "fmt"
+
 // fastEncL3
 type fastEncL3 struct {
 	fastGen
@ -13,6 +15,10 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)

+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
+
 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
@ -75,22 +81,26 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			}
 			candidates := e.table[nextHash]
 			now := load3232(src, nextS)
-			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur, val: cv}}
+
+			// Safe offset distance until s + 4...
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+			e.table[nextHash] = tableEntryPrev{Prev: candidates.Cur, Cur: tableEntry{offset: s + e.cur}}

 			// Check both candidates
 			candidate = candidates.Cur
-			offset := s - (candidate.offset - e.cur)
-			if cv == candidate.val {
-				if offset > maxMatchOffset {
-					cv = now
-					// Previous will also be invalid, we have nothing.
-					continue
-				}
-				o2 := s - (candidates.Prev.offset - e.cur)
-				if cv != candidates.Prev.val || o2 > maxMatchOffset {
+			if candidate.offset < minOffset {
+				cv = now
+				// Previous will also be invalid, we have nothing.
+				continue
+			}
+
+			if cv == load3232(src, candidate.offset-e.cur) {
+				if candidates.Prev.offset < minOffset || cv != load3232(src, candidates.Prev.offset-e.cur) {
 					break
 				}
 				// Both match and are valid, pick longest.
+				offset := s - (candidate.offset - e.cur)
+				o2 := s - (candidates.Prev.offset - e.cur)
 				l1, l2 := matchLen(src[s+4:], src[s-offset+4:]), matchLen(src[s+4:], src[s-o2+4:])
 				if l2 > l1 {
 					candidate = candidates.Prev
@ -100,11 +110,8 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 				// We only check if value mismatches.
 				// Offset will always be invalid in other cases.
 				candidate = candidates.Prev
-				if cv == candidate.val {
-					offset := s - (candidate.offset - e.cur)
-					if offset <= maxMatchOffset {
-						break
-					}
+				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
+					break
 				}
 			}
 			cv = now
@ -152,7 +159,7 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 					nextHash := hash(cv)
 					e.table[nextHash] = tableEntryPrev{
 						Prev: e.table[nextHash].Cur,
-						Cur:  tableEntry{offset: e.cur + t, val: cv},
+						Cur:  tableEntry{offset: e.cur + t},
 					}
 				}
 				goto emitRemainder
@ -164,21 +171,21 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			prevHash := hash(uint32(x))
 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 3, val: uint32(x)},
+				Cur:  tableEntry{offset: e.cur + s - 3},
 			}
 			x >>= 8
 			prevHash = hash(uint32(x))

 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 2, val: uint32(x)},
+				Cur:  tableEntry{offset: e.cur + s - 2},
 			}
 			x >>= 8
 			prevHash = hash(uint32(x))

 			e.table[prevHash] = tableEntryPrev{
 				Prev: e.table[prevHash].Cur,
-				Cur:  tableEntry{offset: e.cur + s - 1, val: uint32(x)},
+				Cur:  tableEntry{offset: e.cur + s - 1},
 			}
 			x >>= 8
 			currHash := hash(uint32(x))
@ -186,21 +193,18 @@ func (e *fastEncL3) Encode(dst *tokens, src []byte) {
 			cv = uint32(x)
 			e.table[currHash] = tableEntryPrev{
 				Prev: candidates.Cur,
-				Cur:  tableEntry{offset: s + e.cur, val: cv},
+				Cur:  tableEntry{offset: s + e.cur},
 			}

 			// Check both candidates
 			candidate = candidates.Cur
-			if cv == candidate.val {
-				offset := s - (candidate.offset - e.cur)
-				if offset <= maxMatchOffset {
-					continue
-				}
-			} else {
+			minOffset := e.cur + s - (maxMatchOffset - 4)
+
+			if candidate.offset > minOffset && cv != load3232(src, candidate.offset-e.cur) {
 				// We only check if value mismatches.
 				// Offset will always be invalid in other cases.
 				candidate = candidates.Prev
-				if cv == candidate.val {
+				if candidate.offset > minOffset && cv == load3232(src, candidate.offset-e.cur) {
 					offset := s - (candidate.offset - e.cur)
 					if offset <= maxMatchOffset {
 						continue
--- a/vendor/github.com/klauspost/compress/flate/level4.go
+++ b/vendor/github.com/klauspost/compress/flate/level4.go
@ -13,7 +13,9 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
-
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}
 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
 		if len(e.hist) == 0 {
@ -90,24 +92,24 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			sCandidate := e.table[nextHashS]
 			lCandidate := e.bTable[nextHashL]
 			next := load6432(src, nextS)
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			entry := tableEntry{offset: s + e.cur}
 			e.table[nextHashS] = entry
 			e.bTable[nextHashL] = entry

 			t = lCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == lCandidate.val {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.offset-e.cur) {
 				// We got a long match. Use that.
 				break
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
 				// Found a 4 match...
 				lCandidate = e.bTable[hash7(next, tableBits)]

 				// If the next long is a candidate, check if we should use that instead...
 				lOff := nextS - (lCandidate.offset - e.cur)
-				if lOff < maxMatchOffset && lCandidate.val == uint32(next) {
+				if lOff < maxMatchOffset && load3232(src, lCandidate.offset-e.cur) == uint32(next) {
 					l1, l2 := matchLen(src[s+4:], src[t+4:]), matchLen(src[nextS+4:], src[nextS-lOff+4:])
 					if l2 > l1 {
 						s = nextS
@ -135,7 +137,7 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 		if nextEmit < s {
 			emitLiteral(dst, src[nextEmit:s])
 		}
-		if false {
+		if debugDeflate {
 			if t >= s {
 				panic("s-t")
 			}
@ -158,8 +160,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			// Index first pair after match end.
 			if int(s+8) < len(src) {
 				cv := load6432(src, s)
-				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
-				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur, val: uint32(cv)}
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: s + e.cur}
+				e.bTable[hash7(cv, tableBits)] = tableEntry{offset: s + e.cur}
 			}
 			goto emitRemainder
 		}
@ -169,20 +171,20 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 			i := nextS
 			if i < s-1 {
 				cv := load6432(src, i)
-				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
-				t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
 				e.bTable[hash7(cv, tableBits)] = t
 				e.bTable[hash7(cv>>8, tableBits)] = t2
-				e.table[hash4u(t2.val, tableBits)] = t2
+				e.table[hash4u(uint32(cv>>8), tableBits)] = t2

 				i += 3
 				for ; i < s-1; i += 3 {
 					cv := load6432(src, i)
-					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
-					t2 := tableEntry{val: uint32(cv >> 8), offset: t.offset + 1}
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
 					e.bTable[hash7(cv, tableBits)] = t
 					e.bTable[hash7(cv>>8, tableBits)] = t2
-					e.table[hash4u(t2.val, tableBits)] = t2
+					e.table[hash4u(uint32(cv>>8), tableBits)] = t2
 				}
 			}
 		}
@ -193,8 +195,8 @@ func (e *fastEncL4) Encode(dst *tokens, src []byte) {
 		o := e.cur + s - 1
 		prevHashS := hash4x64(x, tableBits)
 		prevHashL := hash7(x, tableBits)
-		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
-		e.bTable[prevHashL] = tableEntry{offset: o, val: uint32(x)}
+		e.table[prevHashS] = tableEntry{offset: o}
+		e.bTable[prevHashL] = tableEntry{offset: o}
 		cv = x >> 8
 	}

--- a/vendor/github.com/klauspost/compress/flate/level5.go
+++ b/vendor/github.com/klauspost/compress/flate/level5.go
@ -13,6 +13,9 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}

 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
@ -97,7 +100,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			sCandidate := e.table[nextHashS]
 			lCandidate := e.bTable[nextHashL]
 			next := load6432(src, nextS)
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			entry := tableEntry{offset: s + e.cur}
 			e.table[nextHashS] = entry
 			eLong := &e.bTable[nextHashL]
 			eLong.Cur, eLong.Prev = entry, eLong.Cur
@ -107,14 +110,14 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {

 			t = lCandidate.Cur.offset - e.cur
 			if s-t < maxMatchOffset {
-				if uint32(cv) == lCandidate.Cur.val {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
 					// Store the next match
-					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
-					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur

 					t2 := lCandidate.Prev.offset - e.cur
-					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
 						l = e.matchlen(s+4, t+4, src) + 4
 						ml1 := e.matchlen(s+4, t2+4, src) + 4
 						if ml1 > l {
@ -126,30 +129,30 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 					break
 				}
 				t = lCandidate.Prev.offset - e.cur
-				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
 					// Store the next match
-					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
-					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
 					break
 				}
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
 				// Found a 4 match...
 				l = e.matchlen(s+4, t+4, src) + 4
 				lCandidate = e.bTable[nextHashL]
 				// Store the next match

-				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 				eLong := &e.bTable[nextHashL]
-				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur

 				// If the next long is a candidate, use that...
 				t2 := lCandidate.Cur.offset - e.cur
 				if nextS-t2 < maxMatchOffset {
-					if lCandidate.Cur.val == uint32(next) {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@ -160,7 +163,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 					}
 					// If the previous long is a candidate, use that...
 					t2 = lCandidate.Prev.offset - e.cur
-					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@ -194,7 +197,7 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		if nextEmit < s {
 			emitLiteral(dst, src[nextEmit:s])
 		}
-		if false {
+		if debugDeflate {
 			if t >= s {
 				panic(fmt.Sprintln("s-t", s, t))
 			}
@ -223,31 +226,31 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 			i := s - l + 1
 			if i < s-1 {
 				cv := load6432(src, i)
-				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
+				t := tableEntry{offset: i + e.cur}
 				e.table[hash4x64(cv, tableBits)] = t
 				eLong := &e.bTable[hash7(cv, tableBits)]
 				eLong.Cur, eLong.Prev = t, eLong.Cur

 				// Do an long at i+1
 				cv >>= 8
-				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+				t = tableEntry{offset: t.offset + 1}
 				eLong = &e.bTable[hash7(cv, tableBits)]
 				eLong.Cur, eLong.Prev = t, eLong.Cur

 				// We only have enough bits for a short entry at i+2
 				cv >>= 8
-				t = tableEntry{offset: t.offset + 1, val: uint32(cv)}
+				t = tableEntry{offset: t.offset + 1}
 				e.table[hash4x64(cv, tableBits)] = t

 				// Skip one - otherwise we risk hitting 's'
 				i += 4
 				for ; i < s-1; i += hashEvery {
 					cv := load6432(src, i)
-					t := tableEntry{offset: i + e.cur, val: uint32(cv)}
-					t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+					t := tableEntry{offset: i + e.cur}
+					t2 := tableEntry{offset: t.offset + 1}
 					eLong := &e.bTable[hash7(cv, tableBits)]
 					eLong.Cur, eLong.Prev = t, eLong.Cur
-					e.table[hash4u(t2.val, tableBits)] = t2
+					e.table[hash4u(uint32(cv>>8), tableBits)] = t2
 				}
 			}
 		}
@ -258,9 +261,9 @@ func (e *fastEncL5) Encode(dst *tokens, src []byte) {
 		o := e.cur + s - 1
 		prevHashS := hash4x64(x, tableBits)
 		prevHashL := hash7(x, tableBits)
-		e.table[prevHashS] = tableEntry{offset: o, val: uint32(x)}
+		e.table[prevHashS] = tableEntry{offset: o}
 		eLong := &e.bTable[prevHashL]
-		eLong.Cur, eLong.Prev = tableEntry{offset: o, val: uint32(x)}, eLong.Cur
+		eLong.Cur, eLong.Prev = tableEntry{offset: o}, eLong.Cur
 		cv = x >> 8
 	}

--- a/vendor/github.com/klauspost/compress/flate/level6.go
+++ b/vendor/github.com/klauspost/compress/flate/level6.go
@ -13,6 +13,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
 	)
+	if debugDeflate && e.cur < 0 {
+		panic(fmt.Sprint("e.cur < 0: ", e.cur))
+	}

 	// Protect against e.cur wraparound.
 	for e.cur >= bufferReset {
@ -98,7 +101,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			sCandidate := e.table[nextHashS]
 			lCandidate := e.bTable[nextHashL]
 			next := load6432(src, nextS)
-			entry := tableEntry{offset: s + e.cur, val: uint32(cv)}
+			entry := tableEntry{offset: s + e.cur}
 			e.table[nextHashS] = entry
 			eLong := &e.bTable[nextHashL]
 			eLong.Cur, eLong.Prev = entry, eLong.Cur
@ -109,17 +112,17 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {

 			t = lCandidate.Cur.offset - e.cur
 			if s-t < maxMatchOffset {
-				if uint32(cv) == lCandidate.Cur.val {
+				if uint32(cv) == load3232(src, lCandidate.Cur.offset-e.cur) {
 					// Long candidate matches at least 4 bytes.

 					// Store the next match
-					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
-					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur

 					// Check the previous long candidate as well.
 					t2 := lCandidate.Prev.offset - e.cur
-					if s-t2 < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+					if s-t2 < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
 						l = e.matchlen(s+4, t+4, src) + 4
 						ml1 := e.matchlen(s+4, t2+4, src) + 4
 						if ml1 > l {
@ -132,17 +135,17 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				}
 				// Current value did not match, but check if previous long value does.
 				t = lCandidate.Prev.offset - e.cur
-				if s-t < maxMatchOffset && uint32(cv) == lCandidate.Prev.val {
+				if s-t < maxMatchOffset && uint32(cv) == load3232(src, lCandidate.Prev.offset-e.cur) {
 					// Store the next match
-					e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+					e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 					eLong := &e.bTable[nextHashL]
-					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+					eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur
 					break
 				}
 			}

 			t = sCandidate.offset - e.cur
-			if s-t < maxMatchOffset && uint32(cv) == sCandidate.val {
+			if s-t < maxMatchOffset && uint32(cv) == load3232(src, sCandidate.offset-e.cur) {
 				// Found a 4 match...
 				l = e.matchlen(s+4, t+4, src) + 4

@ -150,9 +153,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				lCandidate = e.bTable[nextHashL]

 				// Store the next match
-				e.table[nextHashS] = tableEntry{offset: nextS + e.cur, val: uint32(next)}
+				e.table[nextHashS] = tableEntry{offset: nextS + e.cur}
 				eLong := &e.bTable[nextHashL]
-				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur, val: uint32(next)}, eLong.Cur
+				eLong.Cur, eLong.Prev = tableEntry{offset: nextS + e.cur}, eLong.Cur

 				// Check repeat at s + repOff
 				const repOff = 1
@ -171,7 +174,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 				// If the next long is a candidate, use that...
 				t2 = lCandidate.Cur.offset - e.cur
 				if nextS-t2 < maxMatchOffset {
-					if lCandidate.Cur.val == uint32(next) {
+					if load3232(src, lCandidate.Cur.offset-e.cur) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@ -182,7 +185,7 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 					}
 					// If the previous long is a candidate, use that...
 					t2 = lCandidate.Prev.offset - e.cur
-					if nextS-t2 < maxMatchOffset && lCandidate.Prev.val == uint32(next) {
+					if nextS-t2 < maxMatchOffset && load3232(src, lCandidate.Prev.offset-e.cur) == uint32(next) {
 						ml := e.matchlen(nextS+4, t2+4, src) + 4
 						if ml > l {
 							t = t2
@ -241,9 +244,9 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 			// Index after match end.
 			for i := nextS + 1; i < int32(len(src))-8; i += 2 {
 				cv := load6432(src, i)
-				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur, val: uint32(cv)}
+				e.table[hash4x64(cv, tableBits)] = tableEntry{offset: i + e.cur}
 				eLong := &e.bTable[hash7(cv, tableBits)]
-				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur, val: uint32(cv)}, eLong.Cur
+				eLong.Cur, eLong.Prev = tableEntry{offset: i + e.cur}, eLong.Cur
 			}
 			goto emitRemainder
 		}
@ -252,8 +255,8 @@ func (e *fastEncL6) Encode(dst *tokens, src []byte) {
 		if true {
 			for i := nextS + 1; i < s-1; i += 2 {
 				cv := load6432(src, i)
-				t := tableEntry{offset: i + e.cur, val: uint32(cv)}
-				t2 := tableEntry{offset: t.offset + 1, val: uint32(cv >> 8)}
+				t := tableEntry{offset: i + e.cur}
+				t2 := tableEntry{offset: t.offset + 1}
 				eLong := &e.bTable[hash7(cv, tableBits)]
 				eLong2 := &e.bTable[hash7(cv>>8, tableBits)]
 				e.table[hash4x64(cv, tableBits)] = t
--- a/vendor/github.com/klauspost/compress/flate/stateless.go
+++ b/vendor/github.com/klauspost/compress/flate/stateless.go
@ -3,10 +3,13 @@ package flate
 import (
 	"io"
 	"math"
+	"sync"
 )

 const (
 	maxStatelessBlock = math.MaxInt16
+	// dictionary will be taken from maxStatelessBlock, so limit it.
+	maxStatelessDict = 8 << 10

 	slTableBits  = 13
 	slTableSize  = 1 << slTableBits
@ -24,11 +27,11 @@ func (s *statelessWriter) Close() error {
 	}
 	s.closed = true
 	// Emit EOF block
-	return StatelessDeflate(s.dst, nil, true)
+	return StatelessDeflate(s.dst, nil, true, nil)
 }

 func (s *statelessWriter) Write(p []byte) (n int, err error) {
-	err = StatelessDeflate(s.dst, p, false)
+	err = StatelessDeflate(s.dst, p, false, nil)
 	if err != nil {
 		return 0, err
 	}
@ -49,11 +52,27 @@ func NewStatelessWriter(dst io.Writer) io.WriteCloser {
 	return &statelessWriter{dst: dst}
 }

+// bitWriterPool contains bit writers that can be reused.
+var bitWriterPool = sync.Pool{
+	New: func() interface{} {
+		return newHuffmanBitWriter(nil)
+	},
+}
+
 // StatelessDeflate allows to compress directly to a Writer without retaining state.
 // When returning everything will be flushed.
-func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
+// Up to 8KB of an optional dictionary can be given which is presumed to presumed to precede the block.
+// Longer dictionaries will be truncated and will still produce valid output.
+// Sending nil dictionary is perfectly fine.
+func StatelessDeflate(out io.Writer, in []byte, eof bool, dict []byte) error {
 	var dst tokens
-	bw := newHuffmanBitWriter(out)
+	bw := bitWriterPool.Get().(*huffmanBitWriter)
+	bw.reset(out)
+	defer func() {
+		// don't keep a reference to our output
+		bw.reset(nil)
+		bitWriterPool.Put(bw)
+	}()
 	if eof && len(in) == 0 {
 		// Just write an EOF block.
 		// Could be faster...
@ -62,35 +81,53 @@ func StatelessDeflate(out io.Writer, in []byte, eof bool) error {
 		return bw.err
 	}

+	// Truncate dict
+	if len(dict) > maxStatelessDict {
+		dict = dict[len(dict)-maxStatelessDict:]
+	}
+
 	for len(in) > 0 {
 		todo := in
-		if len(todo) > maxStatelessBlock {
-			todo = todo[:maxStatelessBlock]
+		if len(todo) > maxStatelessBlock-len(dict) {
+			todo = todo[:maxStatelessBlock-len(dict)]
 		}
 		in = in[len(todo):]
+		uncompressed := todo
+		if len(dict) > 0 {
+			// combine dict and source
+			bufLen := len(todo) + len(dict)
+			combined := make([]byte, bufLen)
+			copy(combined, dict)
+			copy(combined[len(dict):], todo)
+			todo = combined
+		}
 		// Compress
-		statelessEnc(&dst, todo)
+		statelessEnc(&dst, todo, int16(len(dict)))
 		isEof := eof && len(in) == 0

 		if dst.n == 0 {
-			bw.writeStoredHeader(len(todo), isEof)
+			bw.writeStoredHeader(len(uncompressed), isEof)
 			if bw.err != nil {
 				return bw.err
 			}
-			bw.writeBytes(todo)
-		} else if int(dst.n) > len(todo)-len(todo)>>4 {
+			bw.writeBytes(uncompressed)
+		} else if int(dst.n) > len(uncompressed)-len(uncompressed)>>4 {
 			// If we removed less than 1/16th, huffman compress the block.
-			bw.writeBlockHuff(isEof, todo, false)
+			bw.writeBlockHuff(isEof, uncompressed, len(in) == 0)
 		} else {
-			bw.writeBlockDynamic(&dst, isEof, todo, false)
+			bw.writeBlockDynamic(&dst, isEof, uncompressed, len(in) == 0)
+		}
+		if len(in) > 0 {
+			// Retain a dict if we have more
+			dict = todo[len(todo)-maxStatelessDict:]
+			dst.Reset()
 		}
 		if bw.err != nil {
 			return bw.err
 		}
-		dst.Reset()
 	}
 	if !eof {
-		// Align.
+		// Align, only a stored block can do that.
 		bw.writeStoredHeader(0, false)
 	}
 	bw.flush()
@ -116,7 +153,7 @@ func load6416(b []byte, i int16) uint64 {
 		uint64(b[4])<<32 | uint64(b[5])<<40 | uint64(b[6])<<48 | uint64(b[7])<<56
 }

-func statelessEnc(dst *tokens, src []byte) {
+func statelessEnc(dst *tokens, src []byte, startAt int16) {
 	const (
 		inputMargin            = 12 - 1
 		minNonLiteralBlockSize = 1 + 1 + inputMargin
@ -130,15 +167,23 @@ func statelessEnc(dst *tokens, src []byte) {

 	// This check isn't in the Snappy implementation, but there, the caller
 	// instead of the callee handles this case.
-	if len(src) < minNonLiteralBlockSize {
+	if len(src)-int(startAt) < minNonLiteralBlockSize {
 		// We do not fill the token table.
 		// This will be picked up by caller.
-		dst.n = uint16(len(src))
+		dst.n = 0
 		return
 	}
+	// Index until startAt
+	if startAt > 0 {
+		cv := load3232(src, 0)
+		for i := int16(0); i < startAt; i++ {
+			table[hashSL(cv)] = tableEntry{offset: i}
+			cv = (cv >> 8) | (uint32(src[i+4]) << 24)
+		}
+	}

-	s := int16(1)
-	nextEmit := int16(0)
+	s := startAt + 1
+	nextEmit := startAt
 	// sLimit is when to stop looking for offset/length copies. The inputMargin
 	// lets us use a fast path for emitLiteral in the main loop, while we are
 	// looking for copies.
--- a/vendor/github.com/klauspost/compress/flate/token.go
+++ b/vendor/github.com/klauspost/compress/flate/token.go
@ -184,9 +184,7 @@ func (t *tokens) indexTokens(in []token) {
 	t.Reset()
 	for _, tok := range in {
 		if tok < matchType {
-			t.tokens[t.n] = tok
-			t.litHist[tok]++
-			t.n++
+			t.AddLiteral(tok.literal())
 			continue
 		}
 		t.AddMatch(uint32(tok.length()), tok.offset())
@ -211,50 +209,60 @@ func (t *tokens) AddLiteral(lit byte) {
 	t.nLits++
 }

+// from https://stackoverflow.com/a/28730362
+func mFastLog2(val float32) float32 {
+	ux := int32(math.Float32bits(val))
+	log2 := (float32)(((ux >> 23) & 255) - 128)
+	ux &= -0x7f800001
+	ux += 127 << 23
+	uval := math.Float32frombits(uint32(ux))
+	log2 += ((-0.34484843)*uval+2.02466578)*uval - 0.67487759
+	return log2
+}
+
 // EstimatedBits will return an minimum size estimated by an *optimal*
 // compression of the block.
 // The size of the block
 func (t *tokens) EstimatedBits() int {
-	shannon := float64(0)
+	shannon := float32(0)
 	bits := int(0)
 	nMatches := 0
 	if t.nLits > 0 {
-		invTotal := 1.0 / float64(t.nLits)
+		invTotal := 1.0 / float32(t.nLits)
 		for _, v := range t.litHist[:] {
 			if v > 0 {
-				n := float64(v)
-				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
 			}
 		}
 		// Just add 15 for EOB
 		shannon += 15
-		for _, v := range t.extraHist[1 : literalCount-256] {
+		for i, v := range t.extraHist[1 : literalCount-256] {
 			if v > 0 {
-				n := float64(v)
-				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
-				bits += int(lengthExtraBits[v&31]) * int(v)
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
+				bits += int(lengthExtraBits[i&31]) * int(v)
 				nMatches += int(v)
 			}
 		}
 	}
 	if nMatches > 0 {
-		invTotal := 1.0 / float64(nMatches)
-		for _, v := range t.offHist[:offsetCodeCount] {
+		invTotal := 1.0 / float32(nMatches)
+		for i, v := range t.offHist[:offsetCodeCount] {
 			if v > 0 {
-				n := float64(v)
-				shannon += math.Ceil(-math.Log2(n*invTotal) * n)
-				bits += int(offsetExtraBits[v&31]) * int(n)
+				n := float32(v)
+				shannon += -mFastLog2(n*invTotal) * n
+				bits += int(offsetExtraBits[i&31]) * int(v)
 			}
 		}
 	}
-
 	return int(shannon) + bits
 }

 // AddMatch adds a match to the tokens.
 // This function is very sensitive to inlining and right on the border.
 func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
-	if debugDecode {
+	if debugDeflate {
 		if xlength >= maxMatchLength+baseMatchLength {
 			panic(fmt.Errorf("invalid length: %v", xlength))
 		}
@ -273,7 +281,7 @@ func (t *tokens) AddMatch(xlength uint32, xoffset uint32) {
 // AddMatchLong adds a match to the tokens, potentially longer than max match length.
 // Length should NOT have the base subtracted, only offset should.
 func (t *tokens) AddMatchLong(xlength int32, xoffset uint32) {
-	if debugDecode {
+	if debugDeflate {
 		if xoffset >= maxMatchOffset+baseMatchOffset {
 			panic(fmt.Errorf("invalid offset: %v", xoffset))
 		}
--- a/vendor/github.com/klauspost/compress/gzip/gzip.go
+++ b/vendor/github.com/klauspost/compress/gzip/gzip.go
@ -207,7 +207,7 @@ func (z *Writer) Write(p []byte) (int, error) {
 	z.size += uint32(len(p))
 	z.digest = crc32.Update(z.digest, crc32.IEEETable, p)
 	if z.level == StatelessCompression {
-		return len(p), flate.StatelessDeflate(z.w, p, false)
+		return len(p), flate.StatelessDeflate(z.w, p, false, nil)
 	}
 	n, z.err = z.compressor.Write(p)
 	return n, z.err
@ -255,7 +255,7 @@ func (z *Writer) Close() error {
 		}
 	}
 	if z.level == StatelessCompression {
-		z.err = flate.StatelessDeflate(z.w, nil, true)
+		z.err = flate.StatelessDeflate(z.w, nil, true, nil)
 	} else {
 		z.err = z.compressor.Close()
 	}