From 6180f3c8841bad9184e2d9d43c6829eb9b7a63e5 Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Thu, 9 Apr 2020 12:44:45 +0200
Subject: [PATCH 1/3] compress/flate: Improve decompression speed
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Improve decompression speed, mainly through 3 optimizations:

1) Take advantage of the fact that we can read further ahead when we know the current block isn't the last.

The reader guarantees that it will not read beyond the end of the stream.
This poses limitations on the decoder in terms of how far it can read ahead and is set to the size of an end-of-block marker in `f.h1.min = f.bits[endBlockMarker]`.

We can, however, take advantage of the fact that each block indicates whether it is the final block in the stream. So if we are not reading the final block, we can safely read ahead by the size of the smallest possible block: one containing nothing but an EOB marker.

That is a block with a predefined table and a single EOB marker. Since we know the size of the block header and the encoding of the EOB this totals to 10 additional bits. Adding 10 bits reduces the number of stream reads significantly.

Approximately 5% throughput increase.

2) Manually inline f.huffSym call

This change by itself gives about a 13% throughput increase.

3) Generate decoders for stdlib io.ByteReader types

We generate decoders for the known implementations of `io.ByteReader`, namely `*bytes.Buffer`, `*bytes.Reader`, `*bufio.Reader` and `*strings.Reader`.

This change by itself gives about 20-25% throughput increase, including when an `io.Reader` is passed.

I would say only `*strings.Reader` probably isn't that common.

Minor changes:

* Reuse `h.chunks` and `h.links`.
* Trade some bounds checks for AND operations.
* Change chunks from uint32 to uint16.
* Avoid padding of decompressor struct members.

Per-loop allocation was removed from the benchmarks. The 'old' numbers in the benchmark below include this change.

```
name                              old time/op    new time/op    delta
Decode/Digits/Huffman/1e4-32        78.0µs ± 0%    50.5µs ± 1%   -35.26%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e5-32         779µs ± 2%     487µs ± 0%   -37.48%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e6-32        7.68ms ± 0%    4.88ms ± 1%   -36.44%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e4-32          88.5µs ± 1%    59.9µs ± 1%   -32.33%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e5-32           963µs ± 1%     678µs ± 1%   -29.58%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e6-32          9.75ms ± 1%    6.90ms ± 0%   -29.21%  (p=0.008 n=5+5)
Decode/Digits/Default/1e4-32        91.2µs ± 1%    61.4µs ± 0%   -32.72%  (p=0.008 n=5+5)
Decode/Digits/Default/1e5-32         954µs ± 0%     675µs ± 0%   -29.25%  (p=0.008 n=5+5)
Decode/Digits/Default/1e6-32        9.67ms ± 0%    6.79ms ± 1%   -29.76%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e4-32    90.7µs ± 1%    61.5µs ± 1%   -32.21%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e5-32     953µs ± 1%     672µs ± 0%   -29.46%  (p=0.016 n=4+5)
Decode/Digits/Compression/1e6-32    9.76ms ± 4%    6.78ms ± 0%   -30.54%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e4-32        90.4µs ± 0%    54.7µs ± 0%   -39.52%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e5-32         885µs ± 0%     538µs ± 0%   -39.19%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e6-32        8.84ms ± 0%    5.44ms ± 0%   -38.46%  (p=0.016 n=4+5)
Decode/Newton/Speed/1e4-32          81.5µs ± 0%    55.1µs ± 1%   -32.42%  (p=0.016 n=4+5)
Decode/Newton/Speed/1e5-32           751µs ± 4%     528µs ± 0%   -29.70%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e6-32          7.49ms ± 2%    5.32ms ± 0%   -28.92%  (p=0.008 n=5+5)
Decode/Newton/Default/1e4-32        73.3µs ± 1%    48.9µs ± 1%   -33.36%  (p=0.008 n=5+5)
Decode/Newton/Default/1e5-32         601µs ± 2%     418µs ± 0%   -30.40%  (p=0.008 n=5+5)
Decode/Newton/Default/1e6-32        5.92ms ± 0%    4.17ms ± 0%   -29.60%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e4-32    72.7µs ± 0%    48.5µs ± 0%   -33.21%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e5-32     597µs ± 0%     418µs ± 0%   -29.90%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e6-32    5.90ms ± 0%    4.15ms ± 0%   -29.63%  (p=0.016 n=4+5)

name                              old speed      new speed      delta
Decode/Digits/Huffman/1e4-32       128MB/s ± 0%   198MB/s ± 1%   +54.46%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e5-32       128MB/s ± 2%   205MB/s ± 0%   +59.92%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e6-32       130MB/s ± 0%   205MB/s ± 1%   +57.33%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e4-32         113MB/s ± 1%   167MB/s ± 1%   +47.79%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e5-32         104MB/s ± 1%   147MB/s ± 1%   +42.01%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e6-32         103MB/s ± 1%   145MB/s ± 0%   +41.26%  (p=0.008 n=5+5)
Decode/Digits/Default/1e4-32       110MB/s ± 1%   163MB/s ± 0%   +48.63%  (p=0.008 n=5+5)
Decode/Digits/Default/1e5-32       105MB/s ± 0%   148MB/s ± 0%   +41.34%  (p=0.008 n=5+5)
Decode/Digits/Default/1e6-32       103MB/s ± 0%   147MB/s ± 1%   +42.37%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e4-32   110MB/s ± 1%   163MB/s ± 1%   +47.51%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e5-32   105MB/s ± 1%   149MB/s ± 0%   +41.77%  (p=0.016 n=4+5)
Decode/Digits/Compression/1e6-32   102MB/s ± 4%   147MB/s ± 0%   +43.91%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e4-32       111MB/s ± 0%   183MB/s ± 0%   +65.35%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e5-32       113MB/s ± 0%   186MB/s ± 0%   +64.44%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e6-32       113MB/s ± 0%   184MB/s ± 0%   +62.50%  (p=0.016 n=4+5)
Decode/Newton/Speed/1e4-32         123MB/s ± 0%   182MB/s ± 1%   +47.98%  (p=0.016 n=4+5)
Decode/Newton/Speed/1e5-32         133MB/s ± 4%   189MB/s ± 0%   +42.20%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e6-32         134MB/s ± 2%   188MB/s ± 0%   +40.67%  (p=0.008 n=5+5)
Decode/Newton/Default/1e4-32       136MB/s ± 1%   205MB/s ± 1%   +50.05%  (p=0.008 n=5+5)
Decode/Newton/Default/1e5-32       166MB/s ± 2%   239MB/s ± 0%   +43.67%  (p=0.008 n=5+5)
Decode/Newton/Default/1e6-32       169MB/s ± 0%   240MB/s ± 0%   +42.04%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e4-32   138MB/s ± 0%   206MB/s ± 0%   +49.73%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e5-32   168MB/s ± 0%   239MB/s ± 0%   +42.66%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e6-32   170MB/s ± 0%   241MB/s ± 0%   +42.11%  (p=0.016 n=4+5)

name                              old alloc/op   new alloc/op   delta
Decode/Digits/Huffman/1e4-32        0.00B ±NaN%    16.00B ± 0%     +Inf%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e5-32         7.60B ± 8%    32.00B ± 0%  +321.05%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e6-32         79.6B ± 1%    264.0B ± 0%  +231.66%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e4-32           80.0B ± 0%     16.0B ± 0%   -80.00%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e5-32            297B ± 0%       33B ± 0%      ~     (p=0.079 n=4+5)
Decode/Digits/Speed/1e6-32          3.86kB ± 0%    0.27kB ± 0%   -92.98%  (p=0.008 n=5+5)
Decode/Digits/Default/1e4-32         48.0B ± 0%     16.0B ± 0%   -66.67%  (p=0.008 n=5+5)
Decode/Digits/Default/1e5-32          297B ± 0%       49B ± 0%   -83.50%  (p=0.008 n=5+5)
Decode/Digits/Default/1e6-32        4.28kB ± 0%    0.38kB ± 0%      ~     (p=0.079 n=4+5)
Decode/Digits/Compression/1e4-32     48.0B ± 0%     16.0B ± 0%   -66.67%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e5-32      297B ± 0%       49B ± 0%      ~     (p=0.079 n=4+5)
Decode/Digits/Compression/1e6-32    4.28kB ± 0%    0.38kB ± 0%   -91.09%  (p=0.000 n=4+5)
Decode/Newton/Huffman/1e4-32          705B ± 0%       16B ± 0%   -97.73%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e5-32        4.50kB ± 0%    0.03kB ± 0%   -99.27%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e6-32        39.4kB ± 0%     0.3kB ± 0%   -99.29%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e4-32            625B ± 0%       16B ± 0%   -97.44%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e5-32          3.21kB ± 0%    0.03kB ± 0%   -98.97%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e6-32          40.6kB ± 0%     0.3kB ± 0%   -99.25%  (p=0.008 n=5+5)
Decode/Newton/Default/1e4-32          513B ± 0%       16B ± 0%   -96.88%  (p=0.008 n=5+5)
Decode/Newton/Default/1e5-32        2.37kB ± 0%    0.03kB ± 0%   -98.61%  (p=0.008 n=5+5)
Decode/Newton/Default/1e6-32        21.2kB ± 0%     0.2kB ± 0%   -98.97%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e4-32      513B ± 0%       16B ± 0%   -96.88%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e5-32    2.37kB ± 0%    0.03kB ± 0%   -98.61%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e6-32    23.0kB ± 0%     0.2kB ± 0%   -99.07%  (p=0.008 n=5+5)

name                              old allocs/op  new allocs/op  delta
Decode/Digits/Huffman/1e4-32         0.00 ±NaN%      1.00 ± 0%     +Inf%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e5-32         0.00 ±NaN%      2.00 ± 0%     +Inf%  (p=0.008 n=5+5)
Decode/Digits/Huffman/1e6-32         0.00 ±NaN%     16.00 ± 0%     +Inf%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e4-32            3.00 ± 0%      1.00 ± 0%   -66.67%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e5-32            6.00 ± 0%      2.00 ± 0%   -66.67%  (p=0.008 n=5+5)
Decode/Digits/Speed/1e6-32            68.0 ± 0%      16.0 ± 0%   -76.47%  (p=0.008 n=5+5)
Decode/Digits/Default/1e4-32          2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.008 n=5+5)
Decode/Digits/Default/1e5-32          8.00 ± 0%      3.00 ± 0%   -62.50%  (p=0.008 n=5+5)
Decode/Digits/Default/1e6-32          74.0 ± 0%      23.0 ± 0%   -68.92%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e4-32      2.00 ± 0%      1.00 ± 0%   -50.00%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e5-32      8.00 ± 0%      3.00 ± 0%   -62.50%  (p=0.008 n=5+5)
Decode/Digits/Compression/1e6-32      74.0 ± 0%      23.0 ± 0%   -68.92%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e4-32          9.00 ± 0%      1.00 ± 0%   -88.89%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e5-32          18.0 ± 0%       2.0 ± 0%   -88.89%  (p=0.008 n=5+5)
Decode/Newton/Huffman/1e6-32           156 ± 0%        16 ± 0%   -89.74%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e4-32            13.0 ± 0%       1.0 ± 0%   -92.31%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e5-32            26.0 ± 0%       2.0 ± 0%   -92.31%  (p=0.008 n=5+5)
Decode/Newton/Speed/1e6-32             223 ± 0%        16 ± 0%   -92.83%  (p=0.008 n=5+5)
Decode/Newton/Default/1e4-32          10.0 ± 0%       1.0 ± 0%   -90.00%  (p=0.008 n=5+5)
Decode/Newton/Default/1e5-32          27.0 ± 0%       2.0 ± 0%   -92.59%  (p=0.008 n=5+5)
Decode/Newton/Default/1e6-32           153 ± 0%        12 ± 0%   -92.16%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e4-32      10.0 ± 0%       1.0 ± 0%   -90.00%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e5-32      27.0 ± 0%       2.0 ± 0%   -92.59%  (p=0.008 n=5+5)
Decode/Newton/Compression/1e6-32       145 ± 0%        12 ± 0%   -91.72%  (p=0.008 n=5+5)
```

These changes have been included in https://github.com/klauspost/compress for a little more than a month now, which includes fuzz testing.

Change-Id: I7e346330512116baa27e448aa606a2f4e551054c
---
 src/compress/flate/gen_inflate.go | 259 ++++++++++
 src/compress/flate/inflate.go     | 172 +++++--
 src/compress/flate/inflate_gen.go | 829 ++++++++++++++++++++++++++++++
 src/compress/flate/reader_test.go |   6 +-
 4 files changed, 1220 insertions(+), 46 deletions(-)
 create mode 100644 src/compress/flate/gen_inflate.go
 create mode 100644 src/compress/flate/inflate_gen.go

diff --git a/src/compress/flate/gen_inflate.go b/src/compress/flate/gen_inflate.go
new file mode 100644
index 0000000000..9db11f325e
--- /dev/null
+++ b/src/compress/flate/gen_inflate.go
@@ -0,0 +1,259 @@
+// Copyright 2020 The Go Authors. All rights reserved.
+// Use of this source code is governed by a BSD-style
+// license that can be found in the LICENSE file.
+
+// This program generates inflate_gen.go, which holds one specialized copy of
+// the Huffman block decoder for each of *bytes.Buffer, *bytes.Reader,
+// *bufio.Reader and *strings.Reader. Run it with "go generate".
+
+// +build generate
+//go:generate go run $GOFILE
+//go:generate gofmt -w inflate_gen.go
+
+package main
+
+import (
+	"os"
+	"strings"
+)
+
+// main writes inflate_gen.go: a specialized copy of the Huffman block decoder
+// for each known io.ByteReader type, plus the huffmanBlockDecoder dispatcher.
+func main() {
+	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
+	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
+	imports := []string{"bytes", "bufio", "strings", "math/bits"} // only packages the generated code uses
+	var out strings.Builder
+	out.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+`)
+	for _, imp := range imports {
+		out.WriteString("\t\"" + imp + "\"\n")
+	}
+	out.WriteString(")\n\n")
+	template := `
+
+// $FUNCNAME$ decodes a single Huffman block from f.
+// f.r must be a $TYPE$.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) $FUNCNAME$() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.($TYPE$)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).$FUNCNAME$
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+`
+	for i, t := range types {
+		out.WriteString(strings.NewReplacer("$FUNCNAME$", "huffman"+names[i], "$TYPE$", t).Replace(template))
+	}
+	out.WriteString("func (f *decompressor) huffmanBlockDecoder() func() {\n\tswitch f.r.(type) {\n")
+	for i, t := range types {
+		out.WriteString("\t\tcase " + t + ":\n\t\t\treturn f.huffman" + names[i] + "\n")
+	}
+	out.WriteString("\t\tdefault:\n\t\t\treturn f.huffmanBlockGeneric\n\t}\n}\n")
+	f, err := os.Create("inflate_gen.go")
+	if err != nil {
+		panic(err)
+	}
+	if _, err := f.WriteString(out.String()); err != nil {
+		panic(err)
+	}
+	if err := f.Close(); err != nil {
+		panic(err)
+	}
+}
diff --git a/src/compress/flate/inflate.go b/src/compress/flate/inflate.go
index 49921398e2..0145ee8567 100644
--- a/src/compress/flate/inflate.go
+++ b/src/compress/flate/inflate.go
@@ -16,7 +16,8 @@ import (
 )
 
 const (
-	maxCodeLen = 16 // max length of Huffman code
+	maxCodeLen     = 16 // max length of Huffman code
+	maxCodeLenMask = 15 // mask for max length of Huffman code
 	// The next three numbers come from the RFC section 3.2.7, with the
 	// additional proviso in section 3.2.5 which implies that distance codes
 	// 30 and 31 should never occur in compressed data.
@@ -102,10 +103,10 @@ const (
 )
 
 type huffmanDecoder struct {
-	min      int                      // the minimum code length
-	chunks   [huffmanNumChunks]uint32 // chunks as described above
-	links    [][]uint32               // overflow links
-	linkMask uint32                   // mask the width of the link table
+	maxRead  int                       // the maximum number of bits we can read and not overread
+	chunks   *[huffmanNumChunks]uint16 // chunks as described above
+	links    [][]uint16                // overflow links
+	linkMask uint32                    // mask the width of the link table
 }
 
 // Initialize Huffman decoding tables from array of code lengths.
@@ -119,12 +120,15 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	// development to supplement the currently ad-hoc unit tests.
 	const sanity = false
 
-	if h.min != 0 {
-		*h = huffmanDecoder{}
+	if h.chunks == nil {
+		h.chunks = &[huffmanNumChunks]uint16{}
+	}
+	if h.maxRead != 0 {
+		*h = huffmanDecoder{chunks: h.chunks, links: h.links}
 	}
 
 	// Count number of codes of each length,
-	// compute min and max length.
+	// compute min and max length.
 	var count [maxCodeLen]int
 	var min, max int
 	for _, n := range lengths {
@@ -137,7 +141,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 		if n > max {
 			max = n
 		}
-		count[n]++
+		count[n&maxCodeLenMask]++
 	}
 
 	// Empty tree. The decompressor.huffSym function will fail later if the tree
@@ -155,8 +159,8 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 	var nextcode [maxCodeLen]int
 	for i := min; i <= max; i++ {
 		code <<= 1
-		nextcode[i] = code
-		code += count[i]
+		nextcode[i&maxCodeLenMask] = code
+		code += count[i&maxCodeLenMask]
 	}
 
 	// Check that the coding is complete (i.e., that we've
@@ -168,14 +172,23 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 		return false
 	}
 
-	h.min = min
+	h.maxRead = min
+	chunks := h.chunks[:]
+	for i := range chunks {
+		chunks[i] = 0
+	}
+
 	if max > huffmanChunkBits {
 		numLinks := 1 << (uint(max) - huffmanChunkBits)
 		h.linkMask = uint32(numLinks - 1)
 
 		// create link tables
 		link := nextcode[huffmanChunkBits+1] >> 1
-		h.links = make([][]uint32, huffmanNumChunks-link)
+		if cap(h.links) < huffmanNumChunks-link {
+			h.links = make([][]uint16, huffmanNumChunks-link)
+		} else {
+			h.links = h.links[:huffmanNumChunks-link]
+		}
 		for j := uint(link); j < huffmanNumChunks; j++ {
 			reverse := int(bits.Reverse16(uint16(j)))
 			reverse >>= uint(16 - huffmanChunkBits)
@@ -183,9 +196,16 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 			if sanity && h.chunks[reverse] != 0 {
 				panic("impossible: overwriting existing chunk")
 			}
-			h.chunks[reverse] = uint32(off<<huffmanValueShift | (huffmanChunkBits + 1))
-			h.links[off] = make([]uint32, numLinks)
+			h.chunks[reverse] = uint16(off<<huffmanValueShift | (huffmanChunkBits + 1))
+			if cap(h.links[off]) < numLinks {
+				h.links[off] = make([]uint16, numLinks)
+			} else {
+				links := h.links[off][:0]
+				h.links[off] = links[:numLinks]
+			}
 		}
+	} else {
+		h.links = h.links[:0]
 	}
 
 	for i, n := range lengths {
@@ -194,7 +214,7 @@ func (h *huffmanDecoder) init(lengths []int) bool {
 		}
 		code := nextcode[n]
 		nextcode[n]++
-		chunk := uint32(i<<huffmanValueShift | n)
+		chunk := uint16(i<<huffmanValueShift | n)
 		reverse := int(bits.Reverse16(uint16(code)))
 		reverse >>= uint(16 - n)
 		if n <= huffmanChunkBits {
@@ -269,10 +289,6 @@ type decompressor struct {
 	r       Reader
 	roffset int64
 
-	// Input bits, in top of b.
-	b  uint32
-	nb uint
-
 	// Huffman decoders for literal/length, distance.
 	h1, h2 huffmanDecoder
 
@@ -283,19 +299,24 @@ type decompressor struct {
 	// Output history, buffer.
 	dict dictDecoder
 
-	// Temporary buffer (avoids repeated allocation).
-	buf [4]byte
-
 	// Next step in the decompression,
 	// and decompression state.
 	step      func(*decompressor)
 	stepState int
-	final     bool
 	err       error
 	toRead    []byte
 	hl, hd    *huffmanDecoder
 	copyLen   int
 	copyDist  int
+
+	// Temporary buffer (avoids repeated allocation).
+	buf [4]byte
+
+	// Input bits, in top of b.
+	b uint32
+
+	nb    uint
+	final bool
 }
 
 func (f *decompressor) nextBlock() {
@@ -316,7 +337,7 @@ func (f *decompressor) nextBlock() {
 		// compressed, fixed Huffman tables
 		f.hl = &fixedHuffmanDecoder
 		f.hd = nil
-		f.huffmanBlock()
+		f.huffmanBlockDecoder()()
 	case 2:
 		// compressed, dynamic Huffman tables
 		if f.err = f.readHuffman(); f.err != nil {
@@ -324,7 +345,7 @@ func (f *decompressor) nextBlock() {
 		}
 		f.hl = &f.h1
 		f.hd = &f.h2
-		f.huffmanBlock()
+		f.huffmanBlockDecoder()()
 	default:
 		// 3 is reserved.
 		f.err = CorruptInputError(f.roffset)
@@ -460,12 +481,18 @@ func (f *decompressor) readHuffman() error {
 		return CorruptInputError(f.roffset)
 	}
 
-	// As an optimization, we can initialize the min bits to read at a time
+	// As an optimization, we can initialize the maxRead bits to read at a time
 	// for the HLIT tree to the length of the EOB marker since we know that
 	// every block must terminate with one. This preserves the property that
 	// we never read any extra bytes after the end of the DEFLATE stream.
-	if f.h1.min < f.bits[endBlockMarker] {
-		f.h1.min = f.bits[endBlockMarker]
+	if f.h1.maxRead < f.bits[endBlockMarker] {
+		f.h1.maxRead = f.bits[endBlockMarker]
+	}
+	if !f.final {
+		// If not the final block, the smallest block possible is
+		// a predefined table, BTYPE=01, with a single EOB marker.
+		// This will take up 3 + 7 bits.
+		f.h1.maxRead += 10
 	}
 
 	return nil
@@ -475,7 +502,7 @@ func (f *decompressor) readHuffman() error {
 // hl and hd are the Huffman states for the lit/length values
 // and the distance values, respectively. If hd == nil, using the
 // fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlock() {
+func (f *decompressor) huffmanBlockGeneric() {
 	const (
 		stateInit = iota // Zero value must be stateInit
 		stateDict
@@ -491,19 +518,61 @@ func (f *decompressor) huffmanBlock() {
 readLiteral:
 	// Read literal and/or (length, distance) according to RFC section 3.2.3.
 	{
-		v, err := f.huffSym(f.hl)
-		if err != nil {
-			f.err = err
-			return
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// to satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := f.r.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
 		}
+
 		var n uint // number of bits extra
 		var length int
+		var err error
 		switch {
 		case v < 256:
 			f.dict.writeByte(byte(v))
 			if f.dict.availWrite() == 0 {
 				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlock
+				f.step = (*decompressor).huffmanBlockGeneric
 				f.stepState = stateInit
 				return
 			}
@@ -610,7 +679,7 @@ copyHistory:
 
 		if f.dict.availWrite() == 0 || f.copyLen > 0 {
 			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlock // We need to continue this work
+			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
 			f.stepState = stateDict
 			return
 		}
@@ -621,20 +690,31 @@ copyHistory:
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
-	// Discard current half-byte.
-	f.nb = 0
-	f.b = 0
+	// Discard current partial byte.
+	left := (f.nb) & 7
+	f.nb -= left
+	f.b >>= left
+
+	offBytes := f.nb >> 3
+	// Unfilled values will be overwritten.
+	f.buf[0] = uint8(f.b)
+	f.buf[1] = uint8(f.b >> 8)
+	f.buf[2] = uint8(f.b >> 16)
+	f.buf[3] = uint8(f.b >> 24)
+
+	f.roffset += int64(offBytes)
+	f.nb, f.b = 0, 0
 
 	// Length then ones-complement of length.
-	nr, err := io.ReadFull(f.r, f.buf[0:4])
+	nr, err := io.ReadFull(f.r, f.buf[offBytes:4])
 	f.roffset += int64(nr)
 	if err != nil {
 		f.err = noEOF(err)
 		return
 	}
-	n := int(f.buf[0]) | int(f.buf[1])<<8
-	nn := int(f.buf[2]) | int(f.buf[3])<<8
-	if uint16(nn) != uint16(^n) {
+	n := uint16(f.buf[0]) | uint16(f.buf[1])<<8
+	nn := uint16(f.buf[2]) | uint16(f.buf[3])<<8
+	if nn != ^n {
 		f.err = CorruptInputError(f.roffset)
 		return
 	}
@@ -645,7 +725,7 @@ func (f *decompressor) dataBlock() {
 		return
 	}
 
-	f.copyLen = n
+	f.copyLen = int(n)
 	f.copyData()
 }
 
@@ -709,7 +789,7 @@ func (f *decompressor) huffSym(h *huffmanDecoder) (int, error) {
 	// with single element, huffSym must error on these two edge cases. In both
 	// cases, the chunks slice will be 0 for the invalid sequence, leading it
 	// satisfy the n == 0 check below.
-	n := uint(h.min)
+	n := uint(h.maxRead)
 	// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
 	// inline call to moreBits and reassign b,nb back to f on return.
@@ -778,6 +858,8 @@ func (f *decompressor) Reset(r io.Reader, dict []byte) error {
 		r:        makeReader(r),
 		bits:     f.bits,
 		codebits: f.codebits,
+		h1:       f.h1,
+		h2:       f.h2,
 		dict:     f.dict,
 		step:     (*decompressor).nextBlock,
 	}
diff --git a/src/compress/flate/inflate_gen.go b/src/compress/flate/inflate_gen.go
new file mode 100644
index 0000000000..b7e672751e
--- /dev/null
+++ b/src/compress/flate/inflate_gen.go
@@ -0,0 +1,829 @@
+// Code generated by go generate gen_inflate.go. DO NOT EDIT.
+
+package flate
+
+import (
+	"bufio"
+	"bytes"
+	"math/bits"
+	"strings"
+)
+
+// huffmanBytesBuffer decodes a single Huffman block from f.
+// f.r must be a *bytes.Buffer.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBytesBuffer() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Buffer)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesBuffer
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanBytesReader decodes a single Huffman block from f.
+// f.r must be a *bytes.Reader.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBytesReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bytes.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBytesReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanBufioReader decodes a single Huffman block from f.
+// f.r must be a *bufio.Reader.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanBufioReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*bufio.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanBufioReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+// huffmanStringsReader decodes a single Huffman block from f.
+// f.r must be a *strings.Reader.
+// hl and hd are the Huffman states for the lit/length values
+// and the distance values, respectively. If hd == nil, using the
+// fixed distance encoding associated with fixed Huffman blocks.
+func (f *decompressor) huffmanStringsReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(*strings.Reader)
+	moreBits := func() error {
+		c, err := fr.ReadByte()
+		if err != nil {
+			return noEOF(err)
+		}
+		f.roffset++
+		f.b |= uint32(c) << f.nb
+		f.nb += 8
+		return nil
+	}
+
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			nb, b := f.nb, f.b
+			for {
+				for nb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b = b
+						f.nb = nb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					b |= uint32(c) << (nb & 31)
+					nb += 8
+				}
+				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= nb {
+					if n == 0 {
+						f.b = b
+						f.nb = nb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					f.b = b >> (n & 31)
+					f.nb = nb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+
+		var n uint // number of bits extra
+		var length int
+		var err error
+		switch {
+		case v < 256:
+			f.dict.writeByte(byte(v))
+			if f.dict.availWrite() == 0 {
+				f.toRead = f.dict.readFlush()
+				f.step = (*decompressor).huffmanStringsReader
+				f.stepState = stateInit
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+			n = 0
+		case v < 269:
+			length = v*2 - (265*2 - 11)
+			n = 1
+		case v < 273:
+			length = v*4 - (269*4 - 19)
+			n = 2
+		case v < 277:
+			length = v*8 - (273*8 - 35)
+			n = 3
+		case v < 281:
+			length = v*16 - (277*16 - 67)
+			n = 4
+		case v < 285:
+			length = v*32 - (281*32 - 131)
+			n = 5
+		case v < maxNumLit:
+			length = 258
+			n = 0
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		if n > 0 {
+			for f.nb < n {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			length += int(f.b & uint32(1<<n-1))
+			f.b >>= n
+			f.nb -= n
+		}
+
+		var dist int
+		if f.hd == nil {
+			for f.nb < 5 {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
+			f.b >>= 5
+			f.nb -= 5
+		} else {
+			if dist, err = f.huffSym(f.hd); err != nil {
+				f.err = err
+				return
+			}
+		}
+
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << nb
+			for f.nb < nb {
+				if err = moreBits(); err != nil {
+					f.err = err
+					return
+				}
+			}
+			extra |= int(f.b & uint32(1<<nb-1))
+			f.b >>= nb
+			f.nb -= nb
+			dist = 1<<(nb+1) + 1 + extra
+		default:
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		// No check on length; encoding can be prescient.
+		if dist > f.dict.histSize() {
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+
+		f.copyLen, f.copyDist = length, dist
+		goto copyHistory
+	}
+
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+
+		if f.dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = f.dict.readFlush()
+			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
+			f.stepState = stateDict
+			return
+		}
+		goto readLiteral
+	}
+}
+
+func (f *decompressor) huffmanBlockDecoder() func() {
+	switch f.r.(type) {
+	case *bytes.Buffer:
+		return f.huffmanBytesBuffer
+	case *bytes.Reader:
+		return f.huffmanBytesReader
+	case *bufio.Reader:
+		return f.huffmanBufioReader
+	case *strings.Reader:
+		return f.huffmanStringsReader
+	default:
+		return f.huffmanBlockGeneric
+	}
+}
diff --git a/src/compress/flate/reader_test.go b/src/compress/flate/reader_test.go
index 9d2943a540..cac99a8896 100644
--- a/src/compress/flate/reader_test.go
+++ b/src/compress/flate/reader_test.go
@@ -51,10 +51,14 @@ func BenchmarkDecode(b *testing.B) {
 		w.Close()
 		buf1 := compressed.Bytes()
 		buf0, compressed, w = nil, nil, nil
+		src := bytes.NewReader(buf1)
+		dec := NewReader(src)
 		runtime.GC()
 		b.StartTimer()
 		for i := 0; i < b.N; i++ {
-			io.Copy(ioutil.Discard, NewReader(bytes.NewReader(buf1)))
+			src.Reset(buf1)
+			dec.(Resetter).Reset(src, nil)
+			io.Copy(ioutil.Discard, dec)
 		}
 	})
 }

From ae9b62a9851eb3d8acf14cebfef1d8c663e407dc Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Tue, 7 Jun 2022 14:08:33 +0200
Subject: [PATCH 2/3] [klauspost/inflate-improve-speed] * Eliminate length
 branches

* Inline moreBits
* Put values on stack.
* Also generate the fallback.

Change-Id: I64d03424438ebc5dbacd4f364e3e6d3c4936a008
---
 src/compress/flate/gen_inflate.go |  208 +++---
 src/compress/flate/inflate.go     |  204 +-----
 src/compress/flate/inflate_gen.go | 1007 +++++++++++++++++++----------
 3 files changed, 779 insertions(+), 640 deletions(-)

diff --git a/src/compress/flate/gen_inflate.go b/src/compress/flate/gen_inflate.go
index 9db11f325e..a0d1b318bc 100644
--- a/src/compress/flate/gen_inflate.go
+++ b/src/compress/flate/gen_inflate.go
@@ -6,9 +6,11 @@
 // RFC 1951.  The gzip and zlib packages implement access to DEFLATE-based file
 // formats.
 
+//go:build generate
 // +build generate
 
-//go:generate go run $GOFILE && gofmt -w inflate_gen.go
+//go:generate go run $GOFILE
+//go:generate go fmt inflate_gen.go
 
 package main
 
@@ -23,9 +25,9 @@ func main() {
 		panic(err)
 	}
 	defer f.Close()
-	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader"}
-	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader"}
-	imports := []string{"bytes", "bufio", "io", "strings", "math/bits"}
+	types := []string{"*bytes.Buffer", "*bytes.Reader", "*bufio.Reader", "*strings.Reader", "Reader"}
+	names := []string{"BytesBuffer", "BytesReader", "BufioReader", "StringsReader", "GenericReader"}
+	imports := []string{"bytes", "bufio", "strings", "math/bits"}
 	f.WriteString(`// Code generated by go generate gen_inflate.go. DO NOT EDIT.
 
 package flate
@@ -51,17 +53,11 @@ func (f *decompressor) $FUNCNAME$() {
 		stateDict
 	)
 	fr := f.r.($TYPE$)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
 
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fb and fnb,
+	// inline call to moreBits and reassign fb, fnb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -80,165 +76,189 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & 31)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
-
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).$FUNCNAME$
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap as the removed moreBits helper did
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb&31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
 		default:
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap as the removed moreBits helper did
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb&31)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// to satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use fnb and fb,
+			// inline call to moreBits and reassign fb,fnb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
 			}
 		}
-
 		switch {
 		case dist < 4:
 			dist++
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap as the removed moreBits helper did
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb&31)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
-
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).$FUNCNAME$ // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 `
@@ -254,6 +274,6 @@ copyHistory:
 		f.WriteString("\t\t\treturn f.huffman" + names[i] + "\n")
 	}
 	f.WriteString("\t\tdefault:\n")
-	f.WriteString("\t\t\treturn f.huffmanBlockGeneric")
+	f.WriteString("\t\t\treturn f.huffmanGenericReader")
 	f.WriteString("\t}\n}\n")
 }
diff --git a/src/compress/flate/inflate.go b/src/compress/flate/inflate.go
index 0145ee8567..bcdd8f4871 100644
--- a/src/compress/flate/inflate.go
+++ b/src/compress/flate/inflate.go
@@ -30,6 +30,21 @@ const (
 var fixedOnce sync.Once
 var fixedHuffmanDecoder huffmanDecoder
 
+var bitMask32 = [32]uint32{
+	0, 1, 3, 7, 0xF, 0x1F, 0x3F, 0x7F, 0xFF,
+	0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF, 0xFFFF,
+	0x1ffff, 0x3ffff, 0x7FFFF, 0xfFFFF, 0x1fFFFF, 0x3fFFFF, 0x7fFFFF, 0xffFFFF,
+	0x1ffFFFF, 0x3ffFFFF, 0x7ffFFFF, 0xfffFFFF, 0x1fffFFFF, 0x3fffFFFF, 0x7fffFFFF,
+} // bitMask32[n] has the low n bits set, for n = 0..31
+
+// lengthExtra holds a match length minus 3 and its count of extra bits.
+type lengthExtra struct {
+	length, extra uint8
+}
+
+// decCodeToLen maps a length code minus 257 to its base length (stored minus 3) and its number of extra bits.
+var decCodeToLen = [32]lengthExtra{{length: 0x0, extra: 0x0}, {length: 0x1, extra: 0x0}, {length: 0x2, extra: 0x0}, {length: 0x3, extra: 0x0}, {length: 0x4, extra: 0x0}, {length: 0x5, extra: 0x0}, {length: 0x6, extra: 0x0}, {length: 0x7, extra: 0x0}, {length: 0x8, extra: 0x1}, {length: 0xa, extra: 0x1}, {length: 0xc, extra: 0x1}, {length: 0xe, extra: 0x1}, {length: 0x10, extra: 0x2}, {length: 0x14, extra: 0x2}, {length: 0x18, extra: 0x2}, {length: 0x1c, extra: 0x2}, {length: 0x20, extra: 0x3}, {length: 0x28, extra: 0x3}, {length: 0x30, extra: 0x3}, {length: 0x38, extra: 0x3}, {length: 0x40, extra: 0x4}, {length: 0x50, extra: 0x4}, {length: 0x60, extra: 0x4}, {length: 0x70, extra: 0x4}, {length: 0x80, extra: 0x5}, {length: 0xa0, extra: 0x5}, {length: 0xc0, extra: 0x5}, {length: 0xe0, extra: 0x5}, {length: 0xff, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}, {length: 0x0, extra: 0x0}}
+
 // A CorruptInputError reports the presence of corrupt input at a given offset.
 type CorruptInputError int64
 
@@ -498,195 +513,6 @@ func (f *decompressor) readHuffman() error {
 	return nil
 }
 
-// Decode a single Huffman block from f.
-// hl and hd are the Huffman states for the lit/length values
-// and the distance values, respectively. If hd == nil, using the
-// fixed distance encoding associated with fixed Huffman blocks.
-func (f *decompressor) huffmanBlockGeneric() {
-	const (
-		stateInit = iota // Zero value must be stateInit
-		stateDict
-	)
-
-	switch f.stepState {
-	case stateInit:
-		goto readLiteral
-	case stateDict:
-		goto copyHistory
-	}
-
-readLiteral:
-	// Read literal and/or (length, distance) according to RFC section 3.2.3.
-	{
-		var v int
-		{
-			// Inlined v, err := f.huffSym(f.hl)
-			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
-			// with single element, huffSym must error on these two edge cases. In both
-			// cases, the chunks slice will be 0 for the invalid sequence, leading it
-			// satisfy the n == 0 check below.
-			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
-			for {
-				for nb < n {
-					c, err := f.r.ReadByte()
-					if err != nil {
-						f.b = b
-						f.nb = nb
-						f.err = noEOF(err)
-						return
-					}
-					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
-				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
-				n = uint(chunk & huffmanCountMask)
-				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
-					n = uint(chunk & huffmanCountMask)
-				}
-				if n <= nb {
-					if n == 0 {
-						f.b = b
-						f.nb = nb
-						f.err = CorruptInputError(f.roffset)
-						return
-					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
-					v = int(chunk >> huffmanValueShift)
-					break
-				}
-			}
-		}
-
-		var n uint // number of bits extra
-		var length int
-		var err error
-		switch {
-		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
-				f.step = (*decompressor).huffmanBlockGeneric
-				f.stepState = stateInit
-				return
-			}
-			goto readLiteral
-		case v == 256:
-			f.finishBlock()
-			return
-		// otherwise, reference to older data
-		case v < 265:
-			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
-		case v < maxNumLit:
-			length = 258
-			n = 0
-		default:
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-		if n > 0 {
-			for f.nb < n {
-				if err = f.moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
-		if f.hd == nil {
-			for f.nb < 5 {
-				if err = f.moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
-		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
-			}
-		}
-
-		switch {
-		case dist < 4:
-			dist++
-		case dist < maxNumDist:
-			nb := uint(dist-2) >> 1
-			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = f.moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
-		default:
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
-			f.err = CorruptInputError(f.roffset)
-			return
-		}
-
-		f.copyLen, f.copyDist = length, dist
-		goto copyHistory
-	}
-
-copyHistory:
-	// Perform a backwards copy according to RFC section 3.2.3.
-	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
-		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
-		}
-		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
-			f.step = (*decompressor).huffmanBlockGeneric // We need to continue this work
-			f.stepState = stateDict
-			return
-		}
-		goto readLiteral
-	}
-}
-
 // Copy a single uncompressed data block from input to output.
 func (f *decompressor) dataBlock() {
 	// Uncompressed.
diff --git a/src/compress/flate/inflate_gen.go b/src/compress/flate/inflate_gen.go
index b7e672751e..3740f81240 100644
--- a/src/compress/flate/inflate_gen.go
+++ b/src/compress/flate/inflate_gen.go
@@ -20,17 +20,11 @@ func (f *decompressor) huffmanBytesBuffer() {
 		stateDict
 	)
 	fr := f.r.(*bytes.Buffer)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
 
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline the calls to moreBits and store fb, fnb back to f before returning.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -49,165 +43,189 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & 31)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
-
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesBuffer
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n { // refill the bit buffer until the n extra length bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
 		default:
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 { // refill until the 5-bit fixed distance code is available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+			// but is smart enough to keep local variables in registers, so keep using
+			// fnb and fb here, with moreBits inlined, and store them back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
 			}
 		}
-
 		switch {
 		case dist < 4:
 			dist++
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb { // refill until the nb extra distance bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
-
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesBuffer // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // huffmanBytesReader decodes a single Huffman block from f.
@@ -221,17 +239,11 @@ func (f *decompressor) huffmanBytesReader() {
 		stateDict
 	)
 	fr := f.r.(*bytes.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
 
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline the calls to moreBits and store fb, fnb back to f before returning.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -250,165 +262,189 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & 31)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
-
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBytesReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n { // refill the bit buffer until the n extra length bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
 		default:
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 { // refill until the 5-bit fixed distance code is available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+			// but is smart enough to keep local variables in registers, so keep using
+			// fnb and fb here, with moreBits inlined, and store them back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
 			}
 		}
-
 		switch {
 		case dist < 4:
 			dist++
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb { // refill until the nb extra distance bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
-
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBytesReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // huffmanBufioReader decodes a single Huffman block from f.
@@ -422,17 +458,11 @@ func (f *decompressor) huffmanBufioReader() {
 		stateDict
 	)
 	fr := f.r.(*bufio.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
 
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline the calls to moreBits and store fb, fnb back to f before returning.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -451,165 +481,189 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & 31)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
-
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanBufioReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n { // refill the bit buffer until the n extra length bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
 		default:
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 { // refill until the 5-bit fixed distance code is available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+			// but is smart enough to keep local variables in registers, so keep using
+			// fnb and fb here, with moreBits inlined, and store them back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
 			}
 		}
-
 		switch {
 		case dist < 4:
 			dist++
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb { // refill until the nb extra distance bits are available
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
-					f.err = err
+					f.err = noEOF(err) // wrap like the removed moreBits helper and the symbol loops do
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
-
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanBufioReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
 }
 
 // huffmanStringsReader decodes a single Huffman block from f.
@@ -623,17 +677,11 @@ func (f *decompressor) huffmanStringsReader() {
 		stateDict
 	)
 	fr := f.r.(*strings.Reader)
-	moreBits := func() error {
-		c, err := fr.ReadByte()
-		if err != nil {
-			return noEOF(err)
-		}
-		f.roffset++
-		f.b |= uint32(c) << f.nb
-		f.nb += 8
-		return nil
-	}
 
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use fnb and fb,
+	// inline the calls to moreBits and store fb, fnb back to f before returning.
+	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
 		goto readLiteral
@@ -652,165 +700,408 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hl.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
-			nb, b := f.nb, f.b
 			for {
-				for nb < n {
+				for fnb < n {
 					c, err := fr.ReadByte()
 					if err != nil {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = noEOF(err)
 						return
 					}
 					f.roffset++
-					b |= uint32(c) << (nb & 31)
-					nb += 8
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
 				}
-				chunk := f.hl.chunks[b&(huffmanNumChunks-1)]
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
 				n = uint(chunk & huffmanCountMask)
 				if n > huffmanChunkBits {
-					chunk = f.hl.links[chunk>>huffmanValueShift][(b>>huffmanChunkBits)&f.hl.linkMask]
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
 					n = uint(chunk & huffmanCountMask)
 				}
-				if n <= nb {
+				if n <= fnb {
 					if n == 0 {
-						f.b = b
-						f.nb = nb
+						f.b, f.nb = fb, fnb
 						f.err = CorruptInputError(f.roffset)
 						return
 					}
-					f.b = b >> (n & 31)
-					f.nb = nb - n
+					fb = fb >> (n & 31)
+					fnb = fnb - n
 					v = int(chunk >> huffmanValueShift)
 					break
 				}
 			}
 		}
-
-		var n uint // number of bits extra
 		var length int
-		var err error
 		switch {
 		case v < 256:
-			f.dict.writeByte(byte(v))
-			if f.dict.availWrite() == 0 {
-				f.toRead = f.dict.readFlush()
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
 				f.step = (*decompressor).huffmanStringsReader
 				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
 				return
 			}
 			goto readLiteral
 		case v == 256:
+			f.b, f.nb = fb, fnb
 			f.finishBlock()
 			return
 		// otherwise, reference to older data
 		case v < 265:
 			length = v - (257 - 3)
-			n = 0
-		case v < 269:
-			length = v*2 - (265*2 - 11)
-			n = 1
-		case v < 273:
-			length = v*4 - (269*4 - 19)
-			n = 2
-		case v < 277:
-			length = v*8 - (273*8 - 35)
-			n = 3
-		case v < 281:
-			length = v*16 - (277*16 - 67)
-			n = 4
-		case v < 285:
-			length = v*32 - (281*32 - 131)
-			n = 5
 		case v < maxNumLit:
-			length = 258
-			n = 0
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = err
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
 		default:
 			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
 			return
 		}
-		if n > 0 {
-			for f.nb < n {
-				if err = moreBits(); err != nil {
-					f.err = err
-					return
-				}
-			}
-			length += int(f.b & uint32(1<<n-1))
-			f.b >>= n
-			f.nb -= n
-		}
-
-		var dist int
+		var dist uint32
 		if f.hd == nil {
-			for f.nb < 5 {
-				if err = moreBits(); err != nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			dist = int(bits.Reverse8(uint8(f.b & 0x1F << 3)))
-			f.b >>= 5
-			f.nb -= 5
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
 		} else {
-			if dist, err = f.huffSym(f.hd); err != nil {
-				f.err = err
-				return
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
 			}
 		}
-
 		switch {
 		case dist < 4:
 			dist++
 		case dist < maxNumDist:
 			nb := uint(dist-2) >> 1
 			// have 1 bit in bottom of dist, need nb more.
-			extra := (dist & 1) << nb
-			for f.nb < nb {
-				if err = moreBits(); err != nil {
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
 					f.err = err
 					return
 				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
 			}
-			extra |= int(f.b & uint32(1<<nb-1))
-			f.b >>= nb
-			f.nb -= nb
-			dist = 1<<(nb+1) + 1 + extra
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
 		default:
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
 		// No check on length; encoding can be prescient.
-		if dist > f.dict.histSize() {
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
 			f.err = CorruptInputError(f.roffset)
 			return
 		}
-
-		f.copyLen, f.copyDist = length, dist
+		f.copyLen, f.copyDist = length, int(dist)
 		goto copyHistory
 	}
-
 copyHistory:
 	// Perform a backwards copy according to RFC section 3.2.3.
 	{
-		cnt := f.dict.tryWriteCopy(f.copyDist, f.copyLen)
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
 		if cnt == 0 {
-			cnt = f.dict.writeCopy(f.copyDist, f.copyLen)
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
 		}
 		f.copyLen -= cnt
-
-		if f.dict.availWrite() == 0 || f.copyLen > 0 {
-			f.toRead = f.dict.readFlush()
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
 			f.step = (*decompressor).huffmanStringsReader // We need to continue this work
 			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
 			return
 		}
 		goto readLiteral
 	}
+	// Not reached
+}
+
+// huffmanGenericReader decodes a single Huffman block from f.
+// f.r must be a Reader; input is consumed from it one byte at a time.
+// f.hl and f.hd are the Huffman states for the lit/length values and the
+// distance values, respectively. If f.hd == nil, the fixed 5-bit distance
+// encoding associated with fixed Huffman blocks is used.
+func (f *decompressor) huffmanGenericReader() {
+	const (
+		stateInit = iota // Zero value must be stateInit
+		stateDict
+	)
+	fr := f.r.(Reader)
+
+	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
+	// but is smart enough to keep local variables in registers, so use nb and b,
+	// inline call to moreBits and reassign b, nb back to f on return.
+	fnb, fb, dict := f.nb, f.b, &f.dict
+	switch f.stepState {
+	case stateInit:
+		goto readLiteral
+	case stateDict:
+		goto copyHistory
+	}
+
+readLiteral:
+	// Read literal and/or (length, distance) according to RFC section 3.2.3.
+	{
+		var v int
+		{
+			// Inlined v, err := f.huffSym(f.hl)
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hl.maxRead)
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hl.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hl.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hl.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					v = int(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+		var length int
+		switch {
+		case v < 256:
+			dict.writeByte(byte(v))
+			if dict.availWrite() == 0 {
+				f.toRead = dict.readFlush()
+				f.step = (*decompressor).huffmanGenericReader
+				f.stepState = stateInit
+				f.b, f.nb = fb, fnb
+				return
+			}
+			goto readLiteral
+		case v == 256:
+			f.b, f.nb = fb, fnb
+			f.finishBlock()
+			return
+		// otherwise, reference to older data
+		case v < 265:
+			length = v - (257 - 3)
+		case v < maxNumLit:
+			val := decCodeToLen[(v - 257)]
+			length = int(val.length) + 3
+			n := uint(val.extra)
+			for fnb < n {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap with noEOF, as the symbol reads do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			length += int(fb & bitMask32[n])
+			fb >>= n & 31
+			fnb -= n
+		default:
+			f.err = CorruptInputError(f.roffset)
+			f.b, f.nb = fb, fnb
+			return
+		}
+		var dist uint32
+		if f.hd == nil {
+			for fnb < 5 {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap with noEOF, as the symbol reads do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			dist = uint32(bits.Reverse8(uint8(fb & 0x1F << 3)))
+			fb >>= 5
+			fnb -= 5
+		} else {
+			// Since a huffmanDecoder can be empty or be composed of a degenerate tree
+			// with single element, huffSym must error on these two edge cases. In both
+			// cases, the chunks slice will be 0 for the invalid sequence, leading it
+			// satisfy the n == 0 check below.
+			n := uint(f.hd.maxRead)
+			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
+			// but is smart enough to keep local variables in registers, so use nb and b,
+			// inline call to moreBits and reassign b,nb back to f on return.
+			for {
+				for fnb < n {
+					c, err := fr.ReadByte()
+					if err != nil {
+						f.b, f.nb = fb, fnb
+						f.err = noEOF(err)
+						return
+					}
+					f.roffset++
+					fb |= uint32(c) << (fnb & 31)
+					fnb += 8
+				}
+				chunk := f.hd.chunks[fb&(huffmanNumChunks-1)]
+				n = uint(chunk & huffmanCountMask)
+				if n > huffmanChunkBits {
+					chunk = f.hd.links[chunk>>huffmanValueShift][(fb>>huffmanChunkBits)&f.hd.linkMask]
+					n = uint(chunk & huffmanCountMask)
+				}
+				if n <= fnb {
+					if n == 0 {
+						f.b, f.nb = fb, fnb
+						f.err = CorruptInputError(f.roffset)
+						return
+					}
+					fb = fb >> (n & 31)
+					fnb = fnb - n
+					dist = uint32(chunk >> huffmanValueShift)
+					break
+				}
+			}
+		}
+		switch {
+		case dist < 4:
+			dist++
+		case dist < maxNumDist:
+			nb := uint(dist-2) >> 1
+			// have 1 bit in bottom of dist, need nb more.
+			extra := (dist & 1) << (nb & 31)
+			for fnb < nb {
+				c, err := fr.ReadByte()
+				if err != nil {
+					f.b, f.nb = fb, fnb
+					f.err = noEOF(err) // wrap with noEOF, as the symbol reads do
+					return
+				}
+				f.roffset++
+				fb |= uint32(c) << (fnb & 31)
+				fnb += 8
+			}
+			extra |= fb & bitMask32[nb]
+			fb >>= nb & 31
+			fnb -= nb
+			dist = 1<<((nb+1)&31) + 1 + extra
+		default:
+			f.b, f.nb = fb, fnb
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		// No check on length; encoding can be prescient.
+		if dist > uint32(dict.histSize()) {
+			f.b, f.nb = fb, fnb
+			f.err = CorruptInputError(f.roffset)
+			return
+		}
+		f.copyLen, f.copyDist = length, int(dist)
+		goto copyHistory
+	}
+copyHistory:
+	// Perform a backwards copy according to RFC section 3.2.3.
+	{
+		cnt := dict.tryWriteCopy(f.copyDist, f.copyLen)
+		if cnt == 0 {
+			cnt = dict.writeCopy(f.copyDist, f.copyLen)
+		}
+		f.copyLen -= cnt
+		if dict.availWrite() == 0 || f.copyLen > 0 {
+			f.toRead = dict.readFlush()
+			f.step = (*decompressor).huffmanGenericReader // We need to continue this work
+			f.stepState = stateDict
+			f.b, f.nb = fb, fnb
+			return
+		}
+		goto readLiteral
+	}
+	// Not reached
+}
 
 func (f *decompressor) huffmanBlockDecoder() func() {
@@ -823,7 +1114,9 @@ func (f *decompressor) huffmanBlockDecoder() func() {
 		return f.huffmanBufioReader
 	case *strings.Reader:
 		return f.huffmanStringsReader
+	case Reader:
+		return f.huffmanGenericReader
 	default:
-		return f.huffmanBlockGeneric
+		return f.huffmanGenericReader
 	}
 }

From 161f02171c05a37751c7ca314daf5e5b4809905f Mon Sep 17 00:00:00 2001
From: Klaus Post <klauspost@gmail.com>
Date: Fri, 10 Jun 2022 13:07:14 +0200
Subject: [PATCH 3/3] [klauspost/inflate-improve-speed] Fix up comments

Change-Id: If11b81d2de23a2588f3d4c7baa088ed5d614de70
---
 src/compress/flate/gen_inflate.go |  6 ++----
 src/compress/flate/inflate_gen.go | 30 ++++++++++--------------------
 2 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/src/compress/flate/gen_inflate.go b/src/compress/flate/gen_inflate.go
index a0d1b318bc..d5c9a0b94e 100644
--- a/src/compress/flate/gen_inflate.go
+++ b/src/compress/flate/gen_inflate.go
@@ -56,7 +56,7 @@ func (f *decompressor) $FUNCNAME$() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -125,6 +125,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -171,9 +172,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()
diff --git a/src/compress/flate/inflate_gen.go b/src/compress/flate/inflate_gen.go
index 3740f81240..1ab2fa7c45 100644
--- a/src/compress/flate/inflate_gen.go
+++ b/src/compress/flate/inflate_gen.go
@@ -23,7 +23,7 @@ func (f *decompressor) huffmanBytesBuffer() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -92,6 +92,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -138,9 +139,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()
@@ -242,7 +240,7 @@ func (f *decompressor) huffmanBytesReader() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -311,6 +309,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -357,9 +356,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()
@@ -461,7 +457,7 @@ func (f *decompressor) huffmanBufioReader() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -530,6 +526,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -576,9 +573,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()
@@ -680,7 +674,7 @@ func (f *decompressor) huffmanStringsReader() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -749,6 +743,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -795,9 +790,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()
@@ -899,7 +891,7 @@ func (f *decompressor) huffmanGenericReader() {
 
 	// Optimization. Compiler isn't smart enough to keep f.b, f.nb in registers,
 	// but is smart enough to keep local variables in registers, so use nb and b,
-	// inline call to moreBits and reassign b, nb back to f on return.
+	// dict reference and reassign b, nb back to f on return.
 	fnb, fb, dict := f.nb, f.b, &f.dict
 	switch f.stepState {
 	case stateInit:
@@ -968,6 +960,7 @@ readLiteral:
 			return
 		// otherwise, reference to older data
 		case v < 265:
+			// No extra bits
 			length = v - (257 - 3)
 		case v < maxNumLit:
 			val := decCodeToLen[(v - 257)]
@@ -1014,9 +1007,6 @@ readLiteral:
 			// cases, the chunks slice will be 0 for the invalid sequence, leading it
 			// satisfy the n == 0 check below.
 			n := uint(f.hd.maxRead)
-			// Optimization. Compiler isn't smart enough to keep f.b,f.nb in registers,
-			// but is smart enough to keep local variables in registers, so use nb and b,
-			// inline call to moreBits and reassign b,nb back to f on return.
 			for {
 				for fnb < n {
 					c, err := fr.ReadByte()