runtime: fix alignment code in memclr_riscv64.s

The existing code incorrectly determines whether the pointer passed to
memclrNoHeapPointers is 8 byte aligned (it currently checks to see whether
it's 4 byte aligned).

In addition, the code that aligns the pointer, by individually filling
the first few bytes of the buffer with zeros, is also incorrect.  It adjusts
the pointer by the wrong number of bytes, resulting, in most cases, in
an unaligned pointer.

This commit fixes both of these issues by anding the pointer with 7
rather than 3 to determine its alignment, and by individually filling
the first (8 - (pointer & 7)) bytes with 0 to align the buffer, rather
than the first (pointer & 3) bytes.

We also remove an unnecessary immediate MOV instruction.

A new benchmark is added to test the performance of memclrNoHeapPointers
on non-aligned pointers.  Results of the existing and the new benchmark
on a SiFive HiFive Unmatched A00 with 16GB of RAM running Ubuntu 23.04
are presented below.

Memclr/5-4                     21.98n ± 7%   22.66n ± 9%        ~ (p=0.079 n=10)
Memclr/16-4                    20.85n ± 3%   21.09n ± 5%        ~ (p=0.796 n=10)
Memclr/64-4                    28.20n ± 4%   27.50n ± 3%        ~ (p=0.093 n=10)
Memclr/256-4                   53.66n ± 8%   53.44n ± 8%        ~ (p=0.280 n=10)
Memclr/4096-4                  522.6n ± 1%   523.4n ± 1%        ~ (p=0.240 n=10)
Memclr/65536-4                 24.17µ ± 0%   24.13µ ± 0%   -0.19% (p=0.029 n=10)
Memclr/1M-4                    446.9µ ± 0%   446.9µ ± 0%        ~ (p=0.684 n=10)
Memclr/4M-4                    12.69m ± 2%   12.79m ± 3%   +0.78% (p=0.043 n=10)
Memclr/8M-4                    29.75m ± 0%   29.76m ± 0%   +0.03% (p=0.015 n=10)
Memclr/16M-4                   60.34m ± 0%   60.32m ± 0%        ~ (p=0.247 n=10)
Memclr/64M-4                   241.2m ± 0%   241.3m ± 0%        ~ (p=0.247 n=10)
MemclrUnaligned/0_5-4          27.71n ± 0%   27.72n ± 1%        ~ (p=0.142 n=10)
MemclrUnaligned/0_16-4         26.95n ± 0%   26.04n ± 0%   -3.38% (p=0.000 n=10)
MemclrUnaligned/0_64-4         38.27n ± 4%   40.15n ± 6%   +4.89% (p=0.005 n=10)
MemclrUnaligned/0_256-4        63.95n ± 3%   64.19n ± 2%        ~ (p=0.971 n=10)
MemclrUnaligned/0_4096-4       532.6n ± 1%   530.9n ± 1%        ~ (p=0.324 n=10)
MemclrUnaligned/0_65536-4      24.30µ ± 0%   24.22µ ± 0%   -0.32% (p=0.023 n=10)
MemclrUnaligned/1_5-4          29.40n ± 0%   29.39n ± 0%        ~ (p=0.060 n=10)
MemclrUnaligned/1_16-4        632.65n ± 1%   63.80n ± 2%  -89.92% (p=0.000 n=10)
MemclrUnaligned/1_64-4       4091.00n ± 1%   73.23n ± 1%  -98.21% (p=0.000 n=10)
MemclrUnaligned/1_256-4     17803.50n ± 1%   92.03n ± 1%  -99.48% (p=0.000 n=10)
MemclrUnaligned/1_4096-4    294150.0n ± 1%   561.9n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/1_65536-4    4692.80µ ± 1%   24.44µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/4_5-4          27.71n ± 0%   27.71n ± 0%        ~ (p=0.308 n=10)
MemclrUnaligned/4_16-4       1187.00n ± 1%   50.74n ± 3%  -95.72% (p=0.000 n=10)
MemclrUnaligned/4_64-4       4617.00n ± 1%   59.89n ± 2%  -98.70% (p=0.000 n=10)
MemclrUnaligned/4_256-4     18472.50n ± 1%   84.76n ± 2%  -99.54% (p=0.000 n=10)
MemclrUnaligned/4_4096-4    292904.0n ± 1%   553.7n ± 0%  -99.81% (p=0.000 n=10)
MemclrUnaligned/4_65536-4    4716.12µ ± 0%   24.38µ ± 0%  -99.48% (p=0.000 n=10)
MemclrUnaligned/7_5-4          29.39n ± 0%   29.39n ± 0%        ~ (p=1.000 n=10)
MemclrUnaligned/7_16-4        636.80n ± 1%   48.33n ± 5%  -92.41% (p=0.000 n=10)
MemclrUnaligned/7_64-4       4094.00n ± 1%   58.88n ± 3%  -98.56% (p=0.000 n=10)
MemclrUnaligned/7_256-4     17869.00n ± 2%   82.70n ± 3%  -99.54% (p=0.000 n=10)
MemclrUnaligned/7_4096-4    294110.5n ± 1%   554.6n ± 1%  -99.81% (p=0.000 n=10)
MemclrUnaligned/7_65536-4    4735.00µ ± 1%   24.28µ ± 0%  -99.49% (p=0.000 n=10)
MemclrUnaligned/0_1M-4         447.8µ ± 0%   450.0µ ± 1%   +0.51% (p=0.000 n=10)
MemclrUnaligned/0_4M-4         12.68m ± 1%   12.64m ± 2%   -0.33% (p=0.015 n=10)
MemclrUnaligned/0_8M-4         29.76m ± 0%   29.79m ± 2%        ~ (p=0.075 n=10)
MemclrUnaligned/0_16M-4        60.34m ± 1%   60.49m ± 1%        ~ (p=0.353 n=10)
MemclrUnaligned/0_64M-4        241.3m ± 0%   241.4m ± 0%        ~ (p=0.247 n=10)
MemclrUnaligned/1_1M-4       75937.3µ ± 1%   449.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/1_4M-4        313.96m ± 2%   12.69m ± 0%  -95.96% (p=0.000 n=10)
MemclrUnaligned/1_8M-4        630.97m ± 1%   29.76m ± 0%  -95.28% (p=0.000 n=10)
MemclrUnaligned/1_16M-4      1263.47m ± 1%   60.35m ± 2%  -95.22% (p=0.000 n=10)
MemclrUnaligned/1_64M-4       5053.5m ± 0%   241.3m ± 0%  -95.23% (p=0.000 n=10)
MemclrUnaligned/4_1M-4       75880.5µ ± 2%   446.5µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/4_4M-4        314.00m ± 1%   12.71m ± 2%  -95.95% (p=0.000 n=10)
MemclrUnaligned/4_8M-4        630.63m ± 1%   29.77m ± 2%  -95.28% (p=0.000 n=10)
MemclrUnaligned/4_16M-4      1257.80m ± 0%   60.34m ± 2%  -95.20% (p=0.000 n=10)
MemclrUnaligned/4_64M-4       5041.3m ± 1%   241.2m ± 0%  -95.21% (p=0.000 n=10)
MemclrUnaligned/7_1M-4       75866.2µ ± 1%   446.9µ ± 0%  -99.41% (p=0.000 n=10)
MemclrUnaligned/7_4M-4        309.86m ± 1%   12.70m ± 1%  -95.90% (p=0.000 n=10)
MemclrUnaligned/7_8M-4        626.67m ± 1%   29.75m ± 2%  -95.25% (p=0.000 n=10)
MemclrUnaligned/7_16M-4      1252.84m ± 1%   60.31m ± 0%  -95.19% (p=0.000 n=10)
MemclrUnaligned/7_64M-4       5015.8m ± 1%   241.4m ± 0%  -95.19% (p=0.000 n=10)
geomean                        339.1µ        35.83µ       -89.43%

Change-Id: I3b958a1d8e8f5ef205052e6b985a5ce21e92ef85
Reviewed-on: https://go-review.googlesource.com/c/go/+/496455
Run-TryBot: Joel Sing <joel@sing.id.au>
Reviewed-by: Joel Sing <joel@sing.id.au>
Reviewed-by: Keith Randall <khr@golang.org>
Reviewed-by: Matthew Dempsky <mdempsky@google.com>
Reviewed-by: M Zhuo <mzh@golangcn.org>
Reviewed-by: Keith Randall <khr@google.com>
TryBot-Result: Gopher Robot <gobot@golang.org>
This commit is contained in:
Mark Ryan 2023-05-19 14:00:10 +02:00 committed by Joel Sing
parent 2a8969cb36
commit a4772a1a59
2 changed files with 29 additions and 2 deletions

View File

@ -16,10 +16,11 @@ TEXT runtime·memclrNoHeapPointers<ABIInternal>(SB),NOSPLIT,$0-16
BLT X11, X9, check4
// Check alignment
AND $3, X10, X5
AND $7, X10, X5
BEQZ X5, aligned
// Zero one byte at a time until we reach 8 byte alignment.
SUB X5, X9, X5
SUB X5, X11, X11
align:
ADD $-1, X5
@ -28,7 +29,7 @@ align:
BNEZ X5, align
aligned:
MOV $8, X9
// X9 already contains $8
BLT X11, X9, check4
MOV $16, X9
BLT X11, X9, zero8

View File

@ -400,6 +400,32 @@ func BenchmarkMemclr(b *testing.B) {
}
}
// BenchmarkMemclrUnaligned times MemclrBytes on slices that begin off bytes
// into a freshly allocated buffer, for off in {0, 1, 4, 7}.
//
// Two groups of sub-benchmarks are run in order: first a set of sizes given
// in bytes (named "<off> <size>"), then a set of sizes given in MiB (named
// "<off> <size>M").
//
// NOTE(review): this presumably relies on make returning storage that is at
// least 8-byte aligned, so that off equals the misalignment of the cleared
// region — confirm against the allocator.
func BenchmarkMemclrUnaligned(b *testing.B) {
	offsets := []int{0, 1, 4, 7}

	// bench allocates size+off bytes up front (outside the timed
	// sub-benchmark) and then repeatedly clears the trailing size bytes.
	bench := func(name string, off, size int) {
		buf := make([]byte, size+off)
		b.Run(name, func(b *testing.B) {
			b.SetBytes(int64(size))
			for iter := 0; iter < b.N; iter++ {
				MemclrBytes(buf[off:])
			}
		})
	}

	// Sizes expressed in bytes.
	for _, off := range offsets {
		for _, size := range []int{5, 16, 64, 256, 4096, 65536} {
			bench(fmt.Sprint(off, size), off, size)
		}
	}

	// Sizes expressed in MiB; the "M" suffix is part of the benchmark name.
	for _, off := range offsets {
		for _, mib := range []int{1, 4, 8, 16, 64} {
			bench(fmt.Sprint(off, mib, "M"), off, mib<<20)
		}
	}
}
func BenchmarkGoMemclr(b *testing.B) {
benchmarkSizes(b, []int{5, 16, 64, 256}, func(b *testing.B, n int) {
x := make([]byte, n)