diff --git a/src/hash/crc32/crc32_generic.go b/src/hash/crc32/crc32_generic.go index 62fa72028c..10a6367bc9 100644 --- a/src/hash/crc32/crc32_generic.go +++ b/src/hash/crc32/crc32_generic.go @@ -2,7 +2,7 @@ // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. -// +build !amd64,!amd64p32 +// +build !amd64,!amd64p32,!s390x package crc32 diff --git a/src/hash/crc32/crc32_s390x.go b/src/hash/crc32/crc32_s390x.go new file mode 100644 index 0000000000..2f20690389 --- /dev/null +++ b/src/hash/crc32/crc32_s390x.go @@ -0,0 +1,101 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +package crc32 + +import ( + "unsafe" +) + +const ( + vxMinLen = 64 + vxAlignment = 16 + vxAlignMask = vxAlignment - 1 +) + +// hasVectorFacility reports whether the machine has the z/Architecture +// vector facility installed and enabled. +func hasVectorFacility() bool + +var hasVX = hasVectorFacility() + +// vectorizedCastagnoli implements CRC32 using vector instructions. +// It is defined in crc32_s390x.s. +//go:noescape +func vectorizedCastagnoli(crc uint32, p []byte) uint32 + +// vectorizedIEEE implements CRC32 using vector instructions. +// It is defined in crc32_s390x.s. +//go:noescape +func vectorizedIEEE(crc uint32, p []byte) uint32 + +func genericCastagnoli(crc uint32, p []byte) uint32 { + // Use slicing-by-8 on larger inputs. + if len(p) >= sliceBy8Cutoff { + return updateSlicingBy8(crc, castagnoliTable8, p) + } + return update(crc, castagnoliTable, p) +} + +func genericIEEE(crc uint32, p []byte) uint32 { + // Use slicing-by-8 on larger inputs. + if len(p) >= sliceBy8Cutoff { + ieeeTable8Once.Do(func() { + ieeeTable8 = makeTable8(IEEE) + }) + return updateSlicingBy8(crc, ieeeTable8, p) + } + return update(crc, IEEETable, p) +} + +// updateCastagnoli calculates the checksum of p using genericCastagnoli to +// align the data appropriately for vectorCastagnoli. It avoids using +// vectorCastagnoli entirely if the length of p is less than or equal to +// vxMinLen. +func updateCastagnoli(crc uint32, p []byte) uint32 { + // Use vectorized function if vector facility is available and + // data length is above threshold. + if hasVX && len(p) > vxMinLen { + pAddr := uintptr(unsafe.Pointer(&p[0])) + if pAddr&vxAlignMask != 0 { + prealign := vxAlignment - int(pAddr&vxAlignMask) + crc = genericCastagnoli(crc, p[:prealign]) + p = p[prealign:] + } + aligned := len(p) & ^vxAlignMask + crc = vectorizedCastagnoli(crc, p[:aligned]) + p = p[aligned:] + // process remaining data + if len(p) > 0 { + crc = genericCastagnoli(crc, p) + } + return crc + } + return genericCastagnoli(crc, p) +} + +// updateIEEE calculates the checksum of p using genericIEEE to align the data +// appropriately for vectorIEEE. It avoids using vectorIEEE entirely if the length +// of p is less than or equal to vxMinLen. +func updateIEEE(crc uint32, p []byte) uint32 { + // Use vectorized function if vector facility is available and + // data length is above threshold. + if hasVX && len(p) > vxMinLen { + pAddr := uintptr(unsafe.Pointer(&p[0])) + if pAddr&vxAlignMask != 0 { + prealign := vxAlignment - int(pAddr&vxAlignMask) + crc = genericIEEE(crc, p[:prealign]) + p = p[prealign:] + } + aligned := len(p) & ^vxAlignMask + crc = vectorizedIEEE(crc, p[:aligned]) + p = p[aligned:] + // process remaining data + if len(p) > 0 { + crc = genericIEEE(crc, p) + } + return crc + } + return genericIEEE(crc, p) +} diff --git a/src/hash/crc32/crc32_s390x.s b/src/hash/crc32/crc32_s390x.s new file mode 100644 index 0000000000..f8d39f3df9 --- /dev/null +++ b/src/hash/crc32/crc32_s390x.s @@ -0,0 +1,245 @@ +// Copyright 2016 The Go Authors. All rights reserved. +// Use of this source code is governed by a BSD-style +// license that can be found in the LICENSE file. + +#include "textflag.h" + +// Vector register range containing CRC-32 constants + +#define CONST_PERM_LE2BE V9 +#define CONST_R2R1 V10 +#define CONST_R4R3 V11 +#define CONST_R5 V12 +#define CONST_RU_POLY V13 +#define CONST_CRC_POLY V14 + + +// The CRC-32 constant block contains reduction constants to fold and +// process particular chunks of the input data stream in parallel. +// +// Note that the constant definitions below are extended in order to compute +// intermediate results with a single VECTOR GALOIS FIELD MULTIPLY instruction. +// The rightmost doubleword can be 0 to prevent contribution to the result or +// can be multiplied by 1 to perform an XOR without the need for a separate +// VECTOR EXCLUSIVE OR instruction. +// +// The polynomials used are bit-reflected: +// +// IEEE: P'(x) = 0x0edb88320 +// Castagnoli: P'(x) = 0x082f63b78 + + +// IEEE polynomial constants +DATA ·crclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask +DATA ·crclecons+8(SB)/8, $0x0706050403020100 +DATA ·crclecons+16(SB)/8, $0x00000001c6e41596 // R2 +DATA ·crclecons+24(SB)/8, $0x0000000154442bd4 // R1 +DATA ·crclecons+32(SB)/8, $0x00000000ccaa009e // R4 +DATA ·crclecons+40(SB)/8, $0x00000001751997d0 // R3 +DATA ·crclecons+48(SB)/8, $0x0000000000000000 +DATA ·crclecons+56(SB)/8, $0x0000000163cd6124 // R5 +DATA ·crclecons+64(SB)/8, $0x0000000000000000 +DATA ·crclecons+72(SB)/8, $0x00000001F7011641 // u' +DATA ·crclecons+80(SB)/8, $0x0000000000000000 +DATA ·crclecons+88(SB)/8, $0x00000001DB710641 // P'(x) << 1 + +GLOBL ·crclecons(SB),RODATA, $144 + +// Castagonli Polynomial constants +DATA ·crcclecons+0(SB)/8, $0x0F0E0D0C0B0A0908 // LE-to-BE mask +DATA ·crcclecons+8(SB)/8, $0x0706050403020100 +DATA ·crcclecons+16(SB)/8, $0x000000009e4addf8 // R2 +DATA ·crcclecons+24(SB)/8, $0x00000000740eef02 // R1 +DATA ·crcclecons+32(SB)/8, $0x000000014cd00bd6 // R4 +DATA ·crcclecons+40(SB)/8, $0x00000000f20c0dfe // R3 +DATA ·crcclecons+48(SB)/8, $0x0000000000000000 +DATA ·crcclecons+56(SB)/8, $0x00000000dd45aab8 // R5 +DATA ·crcclecons+64(SB)/8, $0x0000000000000000 +DATA ·crcclecons+72(SB)/8, $0x00000000dea713f1 // u' +DATA ·crcclecons+80(SB)/8, $0x0000000000000000 +DATA ·crcclecons+88(SB)/8, $0x0000000105ec76f0 // P'(x) << 1 + +GLOBL ·crcclecons(SB),RODATA, $144 + +// func hasVectorFacility() bool +TEXT ·hasVectorFacility(SB),NOSPLIT,$24-1 + MOVD $x-24(SP), R1 + XC $24, 0(R1), 0(R1) // clear the storage + MOVD $2, R0 // R0 is the number of double words stored -1 + WORD $0xB2B01000 // STFLE 0(R1) + XOR R0, R0 // reset the value of R0 + MOVBZ z-8(SP), R1 + AND $0x40, R1 + BEQ novector +vectorinstalled: + // check if the vector instruction has been enabled + VLEIB $0, $0xF, V16 + VLGVB $0, V16, R1 + CMPBNE R1, $0xF, novector + MOVB $1, ret+0(FP) // have vx + RET +novector: + MOVB $0, ret+0(FP) // no vx + RET + + +// The CRC-32 function(s) use these calling conventions: +// +// Parameters: +// +// R2: Initial CRC value, typically ~0; and final CRC (return) value. +// R3: Input buffer pointer, performance might be improved if the +// buffer is on a doubleword boundary. +// R4: Length of the buffer, must be 64 bytes or greater. +// +// Register usage: +// +// R5: CRC-32 constant pool base pointer. +// V0: Initial CRC value and intermediate constants and results. +// V1..V4: Data for CRC computation. +// V5..V8: Next data chunks that are fetched from the input buffer. +// +// V9..V14: CRC-32 constants. + +// func vectorizedIEEE(crc uint32, p []byte) uint32 +TEXT ·vectorizedIEEE(SB),NOSPLIT,$0 + MOVWZ crc+0(FP), R2 // R2 stores the CRC value + MOVD p+8(FP), R3 // data pointer + MOVD p_len+16(FP), R4 // len(p) + + MOVD $·crclecons(SB), R5 + BR vectorizedBody<>(SB) + +// func vectorizedCastagnoli(crc uint32, p []byte) uint32 +TEXT ·vectorizedCastagnoli(SB),NOSPLIT,$0 + MOVWZ crc+0(FP), R2 // R2 stores the CRC value + MOVD p+8(FP), R3 // data pointer + MOVD p_len+16(FP), R4 // len(p) + + // R5: crc-32 constant pool base pointer, constant is used to reduce crc + MOVD $·crcclecons(SB), R5 + BR vectorizedBody<>(SB) + +TEXT vectorizedBody<>(SB),NOSPLIT,$0 + XOR $0xffffffff, R2 // NOTW R2 + VLM 0(R5), CONST_PERM_LE2BE, CONST_CRC_POLY + + // Load the initial CRC value into the rightmost word of V0 + VZERO V0 + VLVGF $3, R2, V0 + + // Load a 64-byte data chunk and XOR with CRC + VLM 0(R3), V1, V4 // 64-bytes into V1..V4 + + // Reflect the data if the CRC operation is in the bit-reflected domain + VPERM V1, V1, CONST_PERM_LE2BE, V1 + VPERM V2, V2, CONST_PERM_LE2BE, V2 + VPERM V3, V3, CONST_PERM_LE2BE, V3 + VPERM V4, V4, CONST_PERM_LE2BE, V4 + + VX V0, V1, V1 // V1 ^= CRC + ADD $64, R3 // BUF = BUF + 64 + ADD $(-64), R4 + + // Check remaining buffer size and jump to proper folding method + CMP R4, $64 + BLT less_than_64bytes + +fold_64bytes_loop: + // Load the next 64-byte data chunk into V5 to V8 + VLM 0(R3), V5, V8 + VPERM V5, V5, CONST_PERM_LE2BE, V5 + VPERM V6, V6, CONST_PERM_LE2BE, V6 + VPERM V7, V7, CONST_PERM_LE2BE, V7 + VPERM V8, V8, CONST_PERM_LE2BE, V8 + + + // Perform a GF(2) multiplication of the doublewords in V1 with + // the reduction constants in V0. The intermediate result is + // then folded (accumulated) with the next data chunk in V5 and + // stored in V1. Repeat this step for the register contents + // in V2, V3, and V4 respectively. + + VGFMAG CONST_R2R1, V1, V5, V1 + VGFMAG CONST_R2R1, V2, V6, V2 + VGFMAG CONST_R2R1, V3, V7, V3 + VGFMAG CONST_R2R1, V4, V8 ,V4 + + // Adjust buffer pointer and length for next loop + ADD $64, R3 // BUF = BUF + 64 + ADD $(-64), R4 // LEN = LEN - 64 + + CMP R4, $64 + BGE fold_64bytes_loop + +less_than_64bytes: + // Fold V1 to V4 into a single 128-bit value in V1 + VGFMAG CONST_R4R3, V1, V2, V1 + VGFMAG CONST_R4R3, V1, V3, V1 + VGFMAG CONST_R4R3, V1, V4, V1 + + // Check whether to continue with 64-bit folding + CMP R4, $16 + BLT final_fold + +fold_16bytes_loop: + VL 0(R3), V2 // Load next data chunk + VPERM V2, V2, CONST_PERM_LE2BE, V2 + + VGFMAG CONST_R4R3, V1, V2, V1 // Fold next data chunk + + // Adjust buffer pointer and size for folding next data chunk + ADD $16, R3 + ADD $-16, R4 + + // Process remaining data chunks + CMP R4 ,$16 + BGE fold_16bytes_loop + +final_fold: + VLEIB $7, $0x40, V9 + VSRLB V9, CONST_R4R3, V0 + VLEIG $0, $1, V0 + + VGFMG V0, V1, V1 + + VLEIB $7, $0x20, V9 // Shift by words + VSRLB V9, V1, V2 // Store remaining bits in V2 + VUPLLF V1, V1 // Split rightmost doubleword + VGFMAG CONST_R5, V1, V2, V1 // V1 = (V1 * R5) XOR V2 + + + // The input values to the Barret reduction are the degree-63 polynomial + // in V1 (R(x)), degree-32 generator polynomial, and the reduction + // constant u. The Barret reduction result is the CRC value of R(x) mod + // P(x). + // + // The Barret reduction algorithm is defined as: + // + // 1. T1(x) = floor( R(x) / x^32 ) GF2MUL u + // 2. T2(x) = floor( T1(x) / x^32 ) GF2MUL P(x) + // 3. C(x) = R(x) XOR T2(x) mod x^32 + // + // Note: To compensate the division by x^32, use the vector unpack + // instruction to move the leftmost word into the leftmost doubleword + // of the vector register. The rightmost doubleword is multiplied + // with zero to not contribute to the intermedate results. + + + // T1(x) = floor( R(x) / x^32 ) GF2MUL u + VUPLLF V1, V2 + VGFMG CONST_RU_POLY, V2, V2 + + + // Compute the GF(2) product of the CRC polynomial in VO with T1(x) in + // V2 and XOR the intermediate result, T2(x), with the value in V1. + // The final result is in the rightmost word of V2. + + VUPLLF V2 , V2 + VGFMAG CONST_CRC_POLY, V2, V1, V2 + +done: + VLGVF $2, V2, R2 + XOR $0xffffffff, R2 // NOTW R2 + MOVWZ R2, ret + 32(FP) + RET