crypto/internal/fips/sha3: reduce s390x divergence

It's a little annoying, but we can fit the IBM instructions on top of the regular state, avoiding more intrusive interventions. Going forward we should not accept assembly that replaces the whole implementation, because it doubles the work to do any refactoring like the one in this chain. Also, it took me a while to find the specification of these instructions, which should have been linked from the source for the next person who'd have to touch this. Finally, it's really painful to test this without a LUCI TryBot, per #67307. For #69536 Change-Id: I90632a90f06b2aa2e863967de972b12dbaa5b2ae Reviewed-on: https://go-review.googlesource.com/c/go/+/617359 LUCI-TryBot-Result: Go LUCI <golang-scoped@luci-project-accounts.iam.gserviceaccount.com> Auto-Submit: Filippo Valsorda <filippo@golang.org> Reviewed-by: Carlos Amedee <carlos@golang.org> Reviewed-by: Daniel McCarney <daniel@binaryparadox.net> Reviewed-by: Roland Shoemaker <roland@golang.org>
2024-10-02 13:29:47 +02:00 · 2024-10-02 13:29:47 +02:00 · d75fb40e52
parent 312e7e9f8a
commit d75fb40e52
12 changed files with 203 additions and 349 deletions
--- a/src/crypto/internal/fips/sha3/_asm/keccakf_amd64_asm.go
+++ b/src/crypto/internal/fips/sha3/_asm/keccakf_amd64_asm.go
@ -101,7 +101,7 @@ const (
 func main() {
 	Package("golang.org/x/crypto/sha3")
-	ConstraintExpr("amd64,!purego,gc")
+	ConstraintExpr("!purego")
 	keccakF1600()
 	Generate()
 }
--- a/src/crypto/internal/fips/sha3/hashes.go
+++ b/src/crypto/internal/fips/sha3/hashes.go
@ -6,22 +6,22 @@ package sha3
 // New224 returns a new Digest computing the SHA3-224 hash.
 func New224() *Digest {
-	return new224()
+	return &Digest{rate: rateK448, outputLen: 28, dsbyte: dsbyteSHA3}
 }
 // New256 returns a new Digest computing the SHA3-256 hash.
 func New256() *Digest {
-	return new256()
+	return &Digest{rate: rateK512, outputLen: 32, dsbyte: dsbyteSHA3}
 }
 // New384 returns a new Digest computing the SHA3-384 hash.
 func New384() *Digest {
-	return new384()
+	return &Digest{rate: rateK768, outputLen: 48, dsbyte: dsbyteSHA3}
 }
 // New512 returns a new Digest computing the SHA3-512 hash.
 func New512() *Digest {
-	return new512()
+	return &Digest{rate: rateK1024, outputLen: 64, dsbyte: dsbyteSHA3}
 }
 // TODO(fips): do this in the stdlib crypto/sha3 package.
@ -46,22 +46,6 @@ const (
 	rateK1024 = (1600 - 1024) / 8
 )
 func new224Generic() *Digest {
 	return &Digest{rate: rateK448, outputLen: 28, dsbyte: dsbyteSHA3}
 }
 func new256Generic() *Digest {
 	return &Digest{rate: rateK512, outputLen: 32, dsbyte: dsbyteSHA3}
 }
 func new384Generic() *Digest {
 	return &Digest{rate: rateK768, outputLen: 48, dsbyte: dsbyteSHA3}
 }
 func new512Generic() *Digest {
 	return &Digest{rate: rateK1024, outputLen: 64, dsbyte: dsbyteSHA3}
 }
 // NewLegacyKeccak256 returns a new Digest computing the legacy, non-standard
 // Keccak-256 hash.
 func NewLegacyKeccak256() *Digest {
--- a/src/crypto/internal/fips/sha3/keccakf.go
+++ b/src/crypto/internal/fips/sha3/keccakf.go
@ -2,11 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //go:build !amd64 || purego || !gc
 package sha3
-import "math/bits"
+import (
 	"internal/byteorder"
 	"internal/goarch"
 	"math/bits"
 	"unsafe"
 )
 // rc stores the round constants for use in the ι step.
 var rc = [24]uint64{
@ -36,9 +39,23 @@ var rc = [24]uint64{
 	0x8000000080008008,
 }
-// keccakF1600 applies the Keccak permutation to a 1600b-wide
+// keccakF1600Generic applies the Keccak permutation.
-// state represented as a slice of 25 uint64s.
+func keccakF1600Generic(da *[200]byte) {
-func keccakF1600(a *[25]uint64) {
+	var a *[25]uint64
 	if goarch.BigEndian {
 		a = new([25]uint64)
 		for i := range a {
 			a[i] = byteorder.LeUint64(da[i*8:])
 		}
 		defer func() {
 			for i := range a {
 				byteorder.LePutUint64(da[i*8:], a[i])
 			}
 		}()
 	} else {
 		a = (*[25]uint64)(unsafe.Pointer(da))
 	}
 	// Implementation translated from Keccak-inplace.c
 	// in the keccak reference code.
 	var t, bc0, bc1, bc2, bc3, bc4, d0, d1, d2, d3, d4 uint64
--- a/src/crypto/internal/fips/sha3/keccakf_amd64.go
+++ b/src/crypto/internal/fips/sha3/keccakf_amd64.go
@ -1,13 +0,0 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //go:build amd64 && !purego && gc
 package sha3
 // This function is implemented in keccakf_amd64.s.
 //go:noescape
 func keccakF1600(a *[25]uint64)
--- a/src/crypto/internal/fips/sha3/sha3.go
+++ b/src/crypto/internal/fips/sha3/sha3.go
@ -13,9 +13,6 @@ package sha3
 import (
 	"crypto/internal/fips/subtle"
 	"errors"
 	"internal/byteorder"
 	"internal/goarch"
 	"unsafe"
 )
 // spongeDirection indicates the direction bytes are flowing through the sponge.
@ -77,24 +74,8 @@ func (d *Digest) Clone() *Digest {
 // permute applies the KeccakF-1600 permutation.
 func (d *Digest) permute() {
-	var a *[25]uint64
+	keccakF1600(&d.a)
 	if goarch.BigEndian {
 		a = new([25]uint64)
 		for i := range a {
 			a[i] = byteorder.LeUint64(d.a[i*8:])
 		}
 	} else {
 		a = (*[25]uint64)(unsafe.Pointer(&d.a))
 	}
 	keccakF1600(a)
 	d.n = 0
 	if goarch.BigEndian {
 		for i := range a {
 			byteorder.LePutUint64(d.a[i*8:], a[i])
 		}
 	}
 }
 // padAndPermute appends the domain separation bits in dsbyte, applies
@ -115,7 +96,8 @@ func (d *Digest) padAndPermute() {
 }
 // Write absorbs more data into the hash's state.
-func (d *Digest) Write(p []byte) (n int, err error) {
+func (d *Digest) Write(p []byte) (n int, err error) { return d.write(p) }
 func (d *Digest) writeGeneric(p []byte) (n int, err error) {
 	if d.state != spongeAbsorbing {
 		panic("sha3: Write after Read")
 	}
@ -137,7 +119,7 @@ func (d *Digest) Write(p []byte) (n int, err error) {
 }
 // read squeezes an arbitrary number of bytes from the sponge.
-func (d *Digest) read(out []byte) (n int, err error) {
+func (d *Digest) readGeneric(out []byte) (n int, err error) {
 	// If we're still absorbing, pad and apply the permutation.
 	if d.state == spongeAbsorbing {
 		d.padAndPermute()
@ -162,7 +144,8 @@ func (d *Digest) read(out []byte) (n int, err error) {
 // Sum appends the current hash to b and returns the resulting slice.
 // It does not change the underlying hash state.
-func (d *Digest) Sum(b []byte) []byte {
+func (d *Digest) Sum(b []byte) []byte { return d.sum(b) }
 func (d *Digest) sumGeneric(b []byte) []byte {
 	if d.state != spongeAbsorbing {
 		panic("sha3: Sum after Read")
 	}
--- a/src/crypto/internal/fips/sha3/sha3_amd64.go
+++ b/src/crypto/internal/fips/sha3/sha3_amd64.go
@ -0,0 +1,20 @@
 // Copyright 2015 The Go Authors. All rights reserved.
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 //go:build !purego
 package sha3
 //go:noescape
 func keccakF1600(a *[200]byte)
 func (d *Digest) write(p []byte) (n int, err error) {
 	return d.writeGeneric(p)
 }
 func (d *Digest) read(out []byte) (n int, err error) {
 	return d.readGeneric(out)
 }
 func (d *Digest) sum(b []byte) []byte {
 	return d.sumGeneric(b)
 }
--- a/src/crypto/internal/fips/sha3/keccakf_amd64.s
+++ b/src/crypto/internal/fips/sha3/keccakf_amd64.s
@ -1,8 +1,8 @@
 // Code generated by command: go run keccakf_amd64_asm.go -out ../keccakf_amd64.s -pkg sha3. DO NOT EDIT.
-//go:build amd64 && !purego && gc
+//go:build !purego
-// func keccakF1600(a *[25]uint64)
+// func keccakF1600(a *[200]byte)
 TEXT ·keccakF1600(SB), $200-8
 	MOVQ a+0(FP), DI
--- a/src/crypto/internal/fips/sha3/sha3_noasm.go
+++ b/src/crypto/internal/fips/sha3/sha3_noasm.go
@ -2,30 +2,20 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-//go:build !gc || purego || !s390x || !ignore
+//go:build (!amd64 && !s390x) || purego
 package sha3
-func new224() *Digest {
+func keccakF1600(a *[200]byte) {
-	return new224Generic()
+	keccakF1600Generic(a)
 }
-func new256() *Digest {
+func (d *Digest) write(p []byte) (n int, err error) {
-	return new256Generic()
+	return d.writeGeneric(p)
 }
-
+func (d *Digest) read(out []byte) (n int, err error) {
-func new384() *Digest {
+	return d.readGeneric(out)
 	return new384Generic()
 }
-
+func (d *Digest) sum(b []byte) []byte {
-func new512() *Digest {
+	return d.sumGeneric(b)
 	return new512Generic()
 }
 func newShake128() *SHAKE {
 	return newShake128Generic()
 }
 func newShake256() *SHAKE {
 	return newShake256Generic()
 }
--- a/src/crypto/internal/fips/sha3/sha3_s390x.go
+++ b/src/crypto/internal/fips/sha3/sha3_s390x.go
@ -2,298 +2,188 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-//go:build gc && !purego && ignore
+//go:build !purego
 package sha3
 import (
 	"crypto/internal/fips/subtle"
 	"internal/cpu"
 )
 // This file contains code for using the 'compute intermediate
 // message digest' (KIMD) and 'compute last message digest' (KLMD)
-// instructions to compute SHA-3 and SHAKE hashes on IBM Z.
+// instructions to compute SHA-3 and SHAKE hashes on IBM Z. See
 // [z/Architecture Principles of Operation, Fourteenth Edition].
 //
 // [z/Architecture Principles of Operation, Fourteenth Edition]: https://www.ibm.com/docs/en/module_1678991624569/pdf/SA22-7832-13.pdf
-import "internal/cpu"
+func keccakF1600(a *[200]byte) {
 	keccakF1600Generic(a)
 }
 // code represents a 7-bit KIMD/KLMD function code as defined in
 // the Principles of Operation.
 type code uint64
 const (
-	// function codes for KIMD/KLMD
+	// Function codes for KIMD/KLMD, from Figure 7-207.
 	sha3_224  code = 32
-	sha3_256       = 33
+	sha3_256  code = 33
-	sha3_384       = 34
+	sha3_384  code = 34
-	sha3_512       = 35
+	sha3_512  code = 35
-	shake_128      = 36
+	shake_128 code = 36
-	shake_256      = 37
+	shake_256 code = 37
 	nopad          = 0x100
 )
 // kimd is a wrapper for the 'compute intermediate message digest' instruction.
-// src must be a multiple of the rate for the given function code.
+// src is absorbed into the sponge state a.
 // len(src) must be a multiple of the rate for the given function code.
 //
 //go:noescape
-func kimd(function code, chain *[200]byte, src []byte)
+func kimd(function code, a *[200]byte, src []byte)
 // klmd is a wrapper for the 'compute last message digest' instruction.
-// src padding is handled by the instruction.
+// src is padded and absorbed into the sponge state a.
 //
 // If the function is a SHAKE XOF, the sponge is then optionally squeezed into
 // dst by first applying the permutation and then copying the output until dst
 // runs out. If len(dst) is a multiple of rate (including zero), the final
 // permutation is not applied. If the nopad bit of function is set and len(src)
 // is zero, only squeezing is performed.
 //
 //go:noescape
-func klmd(function code, chain *[200]byte, dst, src []byte)
+func klmd(function code, a *[200]byte, dst, src []byte)
-type asmState struct {
+func (d *Digest) write(p []byte) (n int, err error) {
-	a         [200]byte       // 1600 bit state
+	if d.state != spongeAbsorbing {
 	buf       []byte          // care must be taken to ensure cap(buf) is a multiple of rate
 	rate      int             // equivalent to block size
 	storage   [3072]byte      // underlying storage for buf
 	outputLen int             // output length for full security
 	function  code            // KIMD/KLMD function code
 	state     spongeDirection // whether the sponge is absorbing or squeezing
 }
 func newAsmState(function code) *asmState {
 	var s asmState
 	s.function = function
 	switch function {
 	case sha3_224:
 		s.rate = 144
 		s.outputLen = 28
 	case sha3_256:
 		s.rate = 136
 		s.outputLen = 32
 	case sha3_384:
 		s.rate = 104
 		s.outputLen = 48
 	case sha3_512:
 		s.rate = 72
 		s.outputLen = 64
 	case shake_128:
 		s.rate = 168
 		s.outputLen = 32
 	case shake_256:
 		s.rate = 136
 		s.outputLen = 64
 	default:
 		panic("sha3: unrecognized function code")
 	}
 	// limit s.buf size to a multiple of s.rate
 	s.resetBuf()
 	return &s
 }
 func (s *asmState) clone() *asmState {
 	c := *s
 	c.buf = c.storage[:len(s.buf):cap(s.buf)]
 	return &c
 }
 // copyIntoBuf copies b into buf. It will panic if there is not enough space to
 // store all of b.
 func (s *asmState) copyIntoBuf(b []byte) {
 	bufLen := len(s.buf)
 	s.buf = s.buf[:len(s.buf)+len(b)]
 	copy(s.buf[bufLen:], b)
 }
 // resetBuf points buf at storage, sets the length to 0 and sets cap to be a
 // multiple of the rate.
 func (s *asmState) resetBuf() {
 	max := (cap(s.storage) / s.rate) * s.rate
 	s.buf = s.storage[:0:max]
 }
 // Write (via the embedded io.Writer interface) adds more data to the running hash.
 // It never returns an error.
 func (s *asmState) Write(b []byte) (int, error) {
 	if s.state != spongeAbsorbing {
 		panic("sha3: Write after Read")
 	}
-	length := len(b)
+	if !cpu.S390X.HasSHA3 {
-	for len(b) > 0 {
+		return d.writeGeneric(p)
 		if len(s.buf) == 0 && len(b) >= cap(s.buf) {
 			// Hash the data directly and push any remaining bytes
 			// into the buffer.
 			remainder := len(b) % s.rate
 			kimd(s.function, &s.a, b[:len(b)-remainder])
 			if remainder != 0 {
 				s.copyIntoBuf(b[len(b)-remainder:])
 			}
 			return length, nil
 		}
 		if len(s.buf) == cap(s.buf) {
 			// flush the buffer
 			kimd(s.function, &s.a, s.buf)
 			s.buf = s.buf[:0]
 		}
 		// copy as much as we can into the buffer
 		n := len(b)
 		if len(b) > cap(s.buf)-len(s.buf) {
 			n = cap(s.buf) - len(s.buf)
 		}
 		s.copyIntoBuf(b[:n])
 		b = b[n:]
 	}
 	return length, nil
 }
 // Read squeezes an arbitrary number of bytes from the sponge.
 func (s *asmState) Read(out []byte) (n int, err error) {
 	// The 'compute last message digest' instruction only stores the digest
 	// at the first operand (dst) for SHAKE functions.
 	if s.function != shake_128 && s.function != shake_256 {
 		panic("sha3: can only call Read for SHAKE functions")
 	}
-	n = len(out)
+	n = len(p)
-	// need to pad if we were absorbing
+	// If there is buffered input in the state, keep XOR'ing.
-	if s.state == spongeAbsorbing {
+	if d.n > 0 {
-		s.state = spongeSqueezing
+		x := subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
-
+		d.n += x
-		// write hash directly into out if possible
+		p = p[x:]
 		if len(out)%s.rate == 0 {
 			klmd(s.function, &s.a, out, s.buf) // len(out) may be 0
 			s.buf = s.buf[:0]
 			return
 		}
 		// write hash into buffer
 		max := cap(s.buf)
 		if max > len(out) {
 			max = (len(out)/s.rate)*s.rate + s.rate
 		}
 		klmd(s.function, &s.a, s.buf[:max], s.buf)
 		s.buf = s.buf[:max]
 	}
-	for len(out) > 0 {
+	// If the sponge is full, apply the permutation.
-		// flush the buffer
+	if d.n == d.rate {
-		if len(s.buf) != 0 {
+		// Absorbing a "rate"ful of zeroes effectively XORs the state with
-			c := copy(out, s.buf)
+		// zeroes (a no-op) and then runs the permutation. The actual function
-			out = out[c:]
+		// doesn't matter, they all run the same permutation.
-			s.buf = s.buf[c:]
+		kimd(shake_128, &d.a, make([]byte, rateK256))
-			continue
+		d.n = 0
 		}
 		// write hash directly into out if possible
 		if len(out)%s.rate == 0 {
 			klmd(s.function|nopad, &s.a, out, nil)
 			return
 		}
 		// write hash into buffer
 		s.resetBuf()
 		if cap(s.buf) > len(out) {
 			s.buf = s.buf[:(len(out)/s.rate)*s.rate+s.rate]
 		}
 		klmd(s.function|nopad, &s.a, s.buf, nil)
 	}
 	// Absorb full blocks with KIMD.
 	if len(p) >= d.rate {
 		wholeBlocks := len(p) / d.rate * d.rate
 		kimd(d.function(), &d.a, p[:wholeBlocks])
 		p = p[wholeBlocks:]
 	}
 	// If there is any trailing input, XOR it into the state.
 	if len(p) > 0 {
 		d.n += subtle.XORBytes(d.a[d.n:d.rate], d.a[d.n:d.rate], p)
 	}
 	return
 }
-// Sum appends the current hash to b and returns the resulting slice.
+func (d *Digest) sum(b []byte) []byte {
-// It does not change the underlying hash state.
+	if d.state != spongeAbsorbing {
 func (s *asmState) Sum(b []byte) []byte {
 	if s.state != spongeAbsorbing {
 		panic("sha3: Sum after Read")
 	}
 	if !cpu.S390X.HasSHA3 ||
 		d.dsbyte != dsbyteSHA3 && d.dsbyte != dsbyteShake {
 		return d.sumGeneric(b)
 	}
 	// Copy the state to preserve the original.
-	a := s.a
+	a := d.a
-	// Hash the buffer. Note that we don't clear it because we
+	// We "absorb" a buffer of zeroes as long as the amount of input we already
-	// aren't updating the state.
+	// XOR'd into the sponge, to skip over it. The max cap is specified to avoid
-	switch s.function {
+	// an allocation.
 	buf := make([]byte, d.n, rateK256)
 	function := d.function()
 	switch function {
 	case sha3_224, sha3_256, sha3_384, sha3_512:
-		klmd(s.function, &a, nil, s.buf)
+		klmd(function, &a, nil, buf)
-		return append(b, a[:s.outputLen]...)
+		return append(b, a[:d.outputLen]...)
 	case shake_128, shake_256:
-		d := make([]byte, s.outputLen, 64)
+		h := make([]byte, d.outputLen, 64)
-		klmd(s.function, &a, d, s.buf)
+		klmd(function, &a, h, buf)
-		return append(b, d[:s.outputLen]...)
+		return append(b, h...)
 	default:
 		panic("sha3: unknown function")
 	}
 }
-// Reset resets the Hash to its initial state.
+func (d *Digest) read(out []byte) (n int, err error) {
-func (s *asmState) Reset() {
+	if !cpu.S390X.HasSHA3 || d.dsbyte != dsbyteShake {
-	for i := range s.a {
+		return d.readGeneric(out)
 		s.a[i] = 0
 	}
 	s.resetBuf()
 	s.state = spongeAbsorbing
 }
-// Size returns the number of bytes Sum will return.
+	n = len(out)
 func (s *asmState) Size() int {
 	return s.outputLen
 }
-// BlockSize returns the hash's underlying block size.
+	if d.state == spongeAbsorbing {
-// The Write method must be able to accept any amount
+		d.state = spongeSqueezing
 // of data, but it may operate more efficiently if all writes
 // are a multiple of the block size.
 func (s *asmState) BlockSize() int {
 	return s.rate
 }
-// Clone returns a copy of the ShakeHash in its current state.
+		// We "absorb" a buffer of zeroes as long as the amount of input we
-func (s *asmState) Clone() ShakeHash {
+		// already XOR'd into the sponge, to skip over it. The max cap is
-	return s.clone()
+		// specified to avoid an allocation.
-}
+		buf := make([]byte, d.n, rateK256)
 		klmd(d.function(), &d.a, out, buf)
 	} else {
 		// We have "buffered" output still to copy.
 		if d.n < d.rate {
 			x := copy(out, d.a[d.n:d.rate])
 			d.n += x
 			out = out[x:]
 		}
 		if len(out) == 0 {
 			return
 		}
-// new224 returns an assembly implementation of SHA3-224 if available,
+		klmd(d.function()|nopad, &d.a, out, nil)
 // otherwise it returns a generic implementation.
 func new224() *Digest {
 	if cpu.S390X.HasSHA3 {
 		return newAsmState(sha3_224)
 	}
-	return new224Generic()
+
 	if len(out)%d.rate == 0 {
 		// The final permutation was not performed,
 		// so there is no "buffered" output.
 		d.n = d.rate
 	} else {
 		d.n = len(out) % d.rate
 	}
 	return
 }
-// new256 returns an assembly implementation of SHA3-256 if available,
+func (d *Digest) function() code {
-// otherwise it returns a generic implementation.
+	switch d.rate {
-func new256() *Digest {
+	case rateK256:
-	if cpu.S390X.HasSHA3 {
+		return shake_128
-		return newAsmState(sha3_256)
+	case rateK448:
 		return sha3_224
 	case rateK512:
 		if d.dsbyte == dsbyteSHA3 {
 			return sha3_256
 		} else {
 			return shake_256
 		}
 	case rateK768:
 		return sha3_384
 	case rateK1024:
 		return sha3_512
 	default:
 		panic("invalid rate")
 	}
 	return new256Generic()
 }
 // new384 returns an assembly implementation of SHA3-384 if available,
 // otherwise it returns a generic implementation.
 func new384() *Digest {
 	if cpu.S390X.HasSHA3 {
 		return newAsmState(sha3_384)
 	}
 	return new384Generic()
 }
 // new512 returns an assembly implementation of SHA3-512 if available,
 // otherwise it returns a generic implementation.
 func new512() *Digest {
 	if cpu.S390X.HasSHA3 {
 		return newAsmState(sha3_512)
 	}
 	return new512Generic()
 }
 // newShake128 returns an assembly implementation of SHAKE-128 if available,
 // otherwise it returns a generic implementation.
 func newShake128() ShakeHash {
 	if cpu.S390X.HasSHA3 {
 		return newAsmState(shake_128)
 	}
 	return newShake128Generic()
 }
 // newShake256 returns an assembly implementation of SHAKE-256 if available,
 // otherwise it returns a generic implementation.
 func newShake256() ShakeHash {
 	if cpu.S390X.HasSHA3 {
 		return newAsmState(shake_256)
 	}
 	return newShake256Generic()
 }
--- a/src/crypto/internal/fips/sha3/sha3_s390x.s
+++ b/src/crypto/internal/fips/sha3/sha3_s390x.s
@ -2,14 +2,14 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
-//go:build gc && !purego && ignore
+//go:build !purego
 #include "textflag.h"
-// func kimd(function code, chain *[200]byte, src []byte)
+// func kimd(function code, a *[200]byte, src []byte)
 TEXT ·kimd(SB), NOFRAME|NOSPLIT, $0-40
 	MOVD function+0(FP), R0
-	MOVD chain+8(FP), R1
+	MOVD a+8(FP), R1
 	LMG  src+16(FP), R2, R3 // R2=base, R3=len
 continue:
@ -18,11 +18,10 @@ continue:
 	MOVD $0, R0      // reset R0 for pre-go1.8 compilers
 	RET
-// func klmd(function code, chain *[200]byte, dst, src []byte)
+// func klmd(function code, a *[200]byte, dst, src []byte)
 TEXT ·klmd(SB), NOFRAME|NOSPLIT, $0-64
 	// TODO: SHAKE support
 	MOVD function+0(FP), R0
-	MOVD chain+8(FP), R1
+	MOVD a+8(FP), R1
 	LMG  dst+16(FP), R2, R3 // R2=base, R3=len
 	LMG  src+40(FP), R4, R5 // R4=base, R5=len
--- a/src/crypto/internal/fips/sha3/sha3_test.go
+++ b/src/crypto/internal/fips/sha3/sha3_test.go
@ -14,7 +14,6 @@ import (
 	"internal/testenv"
 	"io"
 	"math/rand"
 	"runtime"
 	"strings"
 	"testing"
 )
@ -262,6 +261,7 @@ func TestSqueezing(t *testing.T) {
 		d1.Write([]byte(testString))
 		var multiple []byte
 		for range ref {
 			d1.Read(make([]byte, 0))
 			one := make([]byte, 1)
 			d1.Read(one)
 			multiple = append(multiple, one...)
@ -338,14 +338,6 @@ var sink byte
 func TestAllocations(t *testing.T) {
 	testenv.SkipIfOptimizationOff(t)
 	want := 0.0
 	if runtime.GOARCH == "s390x" {
 		// On s390x the returned hash.Hash is conditional so it escapes.
 		want = 3.0
 	}
 	t.Run("New", func(t *testing.T) {
 		if allocs := testing.AllocsPerRun(10, func() {
 			h := New256()
@ -354,7 +346,7 @@ func TestAllocations(t *testing.T) {
 			out := make([]byte, 0, 32)
 			out = h.Sum(out)
 			sink ^= out[0]
-		}); allocs > want {
+		}); allocs > 0 {
 			t.Errorf("expected zero allocations, got %0.1f", allocs)
 		}
 	})
@ -368,7 +360,7 @@ func TestAllocations(t *testing.T) {
 			sink ^= out[0]
 			h.Read(out)
 			sink ^= out[0]
-		}); allocs > want {
+		}); allocs > 0 {
 			t.Errorf("expected zero allocations, got %0.1f", allocs)
 		}
 	})
@ -377,7 +369,7 @@ func TestAllocations(t *testing.T) {
 			b := []byte("ABC")
 			out := Sum256(b)
 			sink ^= out[0]
-		}); allocs > want {
+		}); allocs > 0 {
 			t.Errorf("expected zero allocations, got %0.1f", allocs)
 		}
 	})
--- a/src/crypto/internal/fips/sha3/shake.go
+++ b/src/crypto/internal/fips/sha3/shake.go
@ -116,19 +116,11 @@ func (s *SHAKE) UnmarshalBinary(b []byte) error {
 // NewShake128 creates a new SHAKE128 XOF.
 func NewShake128() *SHAKE {
-	return newShake128()
+	return &SHAKE{d: Digest{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake}}
 }
 // NewShake256 creates a new SHAKE256 XOF.
 func NewShake256() *SHAKE {
 	return newShake256()
 }
 func newShake128Generic() *SHAKE {
 	return &SHAKE{d: Digest{rate: rateK256, outputLen: 32, dsbyte: dsbyteShake}}
 }
 func newShake256Generic() *SHAKE {
 	return &SHAKE{d: Digest{rate: rateK512, outputLen: 64, dsbyte: dsbyteShake}}
 }