diff --git a/src/hash/crc32/crc32_amd64.s b/src/hash/crc32/crc32_amd64.s index caacfae21d..a775a194df 100644 --- a/src/hash/crc32/crc32_amd64.s +++ b/src/hash/crc32/crc32_amd64.s @@ -12,40 +12,79 @@ TEXT ·castagnoliSSE42(SB),NOSPLIT,$0 NOTL AX - /* If there's less than 8 bytes to process, we do it byte-by-byte. */ + // If there are fewer than 8 bytes to process, skip alignment. CMPQ CX, $8 - JL cleanup + JL less_than_8 - /* Process individual bytes until the input is 8-byte aligned. */ -startup: MOVQ SI, BX ANDQ $7, BX JZ aligned + // Process the first few bytes to 8-byte align the input. + + // BX = 8 - BX. We need to process this many bytes to align. + SUBQ $1, BX + XORQ $7, BX + + BTQ $0, BX + JNC align_2 + CRC32B (SI), AX DECQ CX INCQ SI - JMP startup + +align_2: + BTQ $1, BX + JNC align_4 + + // CRC32W (SI), AX + BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + + SUBQ $2, CX + ADDQ $2, SI + +align_4: + BTQ $2, BX + JNC aligned + + // CRC32L (SI), AX + BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + + SUBQ $4, CX + ADDQ $4, SI aligned: - /* The input is now 8-byte aligned and we can process 8-byte chunks. */ + // The input is now 8-byte aligned and we can process 8-byte chunks. CMPQ CX, $8 - JL cleanup + JL less_than_8 CRC32Q (SI), AX ADDQ $8, SI SUBQ $8, CX JMP aligned -cleanup: - /* We may have some bytes left over that we process one at a time. */ - CMPQ CX, $0 - JE done +less_than_8: + // We may have some bytes left over; process 4 bytes, then 2, then 1. + BTQ $2, CX + JNC less_than_4 + + // CRC32L (SI), AX + BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + ADDQ $4, SI + +less_than_4: + BTQ $1, CX + JNC less_than_2 + + // CRC32W (SI), AX + BYTE $0x66; BYTE $0xf2; BYTE $0x0f; BYTE $0x38; BYTE $0xf1; BYTE $0x06 + ADDQ $2, SI + +less_than_2: + BTQ $0, CX + JNC done CRC32B (SI), AX - INCQ SI - DECQ CX - JMP cleanup done: NOTL AX diff --git a/src/hash/crc32/crc32_test.go b/src/hash/crc32/crc32_test.go index e2b3557828..067c42adf0 100644 --- a/src/hash/crc32/crc32_test.go +++ b/src/hash/crc32/crc32_test.go @@ -67,56 +67,68 @@ func TestGolden(t *testing.T) { t.Errorf("Castagnoli(%s) = 0x%x want 0x%x", g.in, s, g.castagnoli) } - if len(g.in) > 0 { - // The SSE4.2 implementation of this has code to deal - // with misaligned data so we ensure that we test that - // too. - castagnoli = New(castagnoliTab) - io.WriteString(castagnoli, g.in[:1]) - io.WriteString(castagnoli, g.in[1:]) - s = castagnoli.Sum32() - if s != g.castagnoli { - t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli) + // The SSE4.2 implementation of this has code to deal + // with misaligned data so we ensure that we test that + // too. + for delta := 1; delta <= 7; delta++ { + if len(g.in) > delta { + in := []byte(g.in) + castagnoli = New(castagnoliTab) + castagnoli.Write(in[:delta]) + castagnoli.Write(in[delta:]) + s = castagnoli.Sum32() + if s != g.castagnoli { + t.Errorf("Castagnoli[misaligned](%s) = 0x%x want 0x%x", g.in, s, g.castagnoli) + } } } } } func BenchmarkIEEECrc40B(b *testing.B) { - benchmark(b, NewIEEE(), 40) + benchmark(b, NewIEEE(), 40, 0) } func BenchmarkIEEECrc1KB(b *testing.B) { - benchmark(b, NewIEEE(), 1<<10) + benchmark(b, NewIEEE(), 1<<10, 0) } func BenchmarkIEEECrc4KB(b *testing.B) { - benchmark(b, NewIEEE(), 4<<10) + benchmark(b, NewIEEE(), 4<<10, 0) } func BenchmarkIEEECrc32KB(b *testing.B) { - benchmark(b, NewIEEE(), 32<<10) + benchmark(b, NewIEEE(), 32<<10, 0) +} + +func BenchmarkCastagnoliCrc15B(b *testing.B) { + benchmark(b, New(MakeTable(Castagnoli)), 15, 0) +} + +func BenchmarkCastagnoliCrc15BMisaligned(b *testing.B) { + benchmark(b, New(MakeTable(Castagnoli)), 15, 1) } func BenchmarkCastagnoliCrc40B(b *testing.B) { - benchmark(b, New(MakeTable(Castagnoli)), 40) + benchmark(b, New(MakeTable(Castagnoli)), 40, 0) } func BenchmarkCastagnoliCrc1KB(b *testing.B) { - benchmark(b, New(MakeTable(Castagnoli)), 1<<10) + benchmark(b, New(MakeTable(Castagnoli)), 1<<10, 0) } func BenchmarkCastagnoliCrc4KB(b *testing.B) { - benchmark(b, New(MakeTable(Castagnoli)), 4<<10) + benchmark(b, New(MakeTable(Castagnoli)), 4<<10, 0) } func BenchmarkCastagnoliCrc32KB(b *testing.B) { - benchmark(b, New(MakeTable(Castagnoli)), 32<<10) + benchmark(b, New(MakeTable(Castagnoli)), 32<<10, 0) } -func benchmark(b *testing.B, h hash.Hash32, n int64) { +func benchmark(b *testing.B, h hash.Hash32, n, alignment int64) { b.SetBytes(n) - data := make([]byte, n) + data := make([]byte, n+alignment) + data = data[alignment:] for i := range data { data[i] = byte(i) }