mirror of https://github.com/golang/go.git
cmd/compile/internal/gc: speed-up small array comparison
Currently we inline array comparisons for arrays with at most 4 elements. Compare arrays of small total size, but with more than 4 elements (e.g. [16]byte), using larger compares. This provides very slightly smaller binaries, and results in faster code. ArrayEqual-6 7.41ns ± 0% 3.17ns ± 0% -57.15% (p=0.000 n=10+10) For go tool: global text (code) = -559 bytes (-0.014566%) This also helps mapaccess1_faststr, and maps in general: MapDelete/Str/1-6 195ns ± 1% 186ns ± 2% -4.47% (p=0.000 n=10+10) MapDelete/Str/2-6 211ns ± 1% 177ns ± 1% -16.01% (p=0.000 n=10+10) MapDelete/Str/4-6 225ns ± 1% 183ns ± 1% -18.49% (p=0.000 n=8+10) MapStringKeysEight_16-6 31.3ns ± 0% 28.6ns ± 0% -8.63% (p=0.000 n=6+9) MapStringKeysEight_32-6 29.2ns ± 0% 27.6ns ± 0% -5.45% (p=0.000 n=10+10) MapStringKeysEight_64-6 29.1ns ± 1% 27.5ns ± 0% -5.46% (p=0.000 n=10+10) MapStringKeysEight_1M-6 29.1ns ± 1% 27.6ns ± 0% -5.49% (p=0.000 n=10+10) Change-Id: I9ec98e41b233031e0e96c4e13d86a324f628ed4a Reviewed-on: https://go-review.googlesource.com/40771 Run-TryBot: Ilya Tocar <ilya.tocar@intel.com> TryBot-Result: Gobot Gobot <gobot@golang.org> Reviewed-by: Keith Randall <khr@golang.org>
This commit is contained in:
parent
2f73efa971
commit
3bdc2f3abf
|
|
@ -166,7 +166,7 @@ var allAsmTests = []*asmTests{
|
|||
{
|
||||
arch: "amd64",
|
||||
os: "linux",
|
||||
imports: []string{"encoding/binary", "math/bits"},
|
||||
imports: []string{"encoding/binary", "math/bits", "unsafe"},
|
||||
tests: linuxAMD64Tests,
|
||||
},
|
||||
{
|
||||
|
|
@ -869,6 +869,35 @@ var linuxAMD64Tests = []*asmTest{
|
|||
}`,
|
||||
[]string{"\tRORB\t"},
|
||||
},
|
||||
// Check that array compare uses 2/4/8 byte compares
|
||||
{
|
||||
`
|
||||
func f68(a,b [2]byte) bool {
|
||||
return a == b
|
||||
}`,
|
||||
[]string{"\tCMPW\t[A-Z]"},
|
||||
},
|
||||
{
|
||||
`
|
||||
func f69(a,b [3]uint16) bool {
|
||||
return a == b
|
||||
}`,
|
||||
[]string{"\tCMPL\t[A-Z]"},
|
||||
},
|
||||
{
|
||||
`
|
||||
func f70(a,b [15]byte) bool {
|
||||
return a == b
|
||||
}`,
|
||||
[]string{"\tCMPQ\t[A-Z]"},
|
||||
},
|
||||
{
|
||||
`
|
||||
func f71(a,b unsafe.Pointer) bool { // This was a TODO in mapaccess1_faststr
|
||||
return *((*[4]byte)(a)) != *((*[4]byte)(b))
|
||||
}`,
|
||||
[]string{"\tCMPL\t[A-Z]"},
|
||||
},
|
||||
}
|
||||
|
||||
var linux386Tests = []*asmTest{
|
||||
|
|
|
|||
|
|
@ -3243,11 +3243,25 @@ func walkcompare(n *Node, init *Nodes) *Node {
|
|||
// inline or call an eq alg.
|
||||
t := n.Left.Type
|
||||
var inline bool
|
||||
|
||||
maxcmpsize := int64(4)
|
||||
unalignedLoad := false
|
||||
switch thearch.LinkArch.Family {
|
||||
case sys.AMD64, sys.ARM64, sys.S390X:
|
||||
// Keep this low enough, to generate less code than function call.
|
||||
maxcmpsize = 16
|
||||
unalignedLoad = true
|
||||
case sys.I386:
|
||||
maxcmpsize = 8
|
||||
unalignedLoad = true
|
||||
}
|
||||
|
||||
switch t.Etype {
|
||||
default:
|
||||
return n
|
||||
case TARRAY:
|
||||
inline = t.NumElem() <= 1 || (t.NumElem() <= 4 && issimple[t.Elem().Etype])
|
||||
// We can compare several elements at once with 2/4/8 byte integer compares
|
||||
inline = t.NumElem() <= 1 || (issimple[t.Elem().Etype] && (t.NumElem() <= 4 || t.Elem().Width*t.NumElem() <= maxcmpsize))
|
||||
case TSTRUCT:
|
||||
inline = t.NumFields() <= 4
|
||||
}
|
||||
|
|
@ -3333,11 +3347,54 @@ func walkcompare(n *Node, init *Nodes) *Node {
|
|||
)
|
||||
}
|
||||
} else {
|
||||
for i := 0; int64(i) < t.NumElem(); i++ {
|
||||
compare(
|
||||
nod(OINDEX, cmpl, nodintconst(int64(i))),
|
||||
nod(OINDEX, cmpr, nodintconst(int64(i))),
|
||||
)
|
||||
step := int64(1)
|
||||
remains := t.NumElem() * t.Elem().Width
|
||||
combine64bit := unalignedLoad && Widthreg == 8 && t.Elem().Width <= 4 && t.Elem().IsInteger()
|
||||
combine32bit := unalignedLoad && t.Elem().Width <= 2 && t.Elem().IsInteger()
|
||||
combine16bit := unalignedLoad && t.Elem().Width == 1 && t.Elem().IsInteger()
|
||||
for i := int64(0); remains > 0; {
|
||||
var convType *types.Type
|
||||
switch {
|
||||
case remains >= 8 && combine64bit:
|
||||
convType = types.Types[TINT64]
|
||||
step = 8 / t.Elem().Width
|
||||
case remains >= 4 && combine32bit:
|
||||
convType = types.Types[TUINT32]
|
||||
step = 4 / t.Elem().Width
|
||||
case remains >= 2 && combine16bit:
|
||||
convType = types.Types[TUINT16]
|
||||
step = 2 / t.Elem().Width
|
||||
default:
|
||||
step = 1
|
||||
}
|
||||
if step == 1 {
|
||||
compare(
|
||||
nod(OINDEX, cmpl, nodintconst(int64(i))),
|
||||
nod(OINDEX, cmpr, nodintconst(int64(i))),
|
||||
)
|
||||
i++
|
||||
remains -= t.Elem().Width
|
||||
} else {
|
||||
cmplw := nod(OINDEX, cmpl, nodintconst(int64(i)))
|
||||
cmplw = conv(cmplw, convType)
|
||||
cmprw := nod(OINDEX, cmpr, nodintconst(int64(i)))
|
||||
cmprw = conv(cmprw, convType)
|
||||
// For code like this: uint32(s[0]) | uint32(s[1])<<8 | uint32(s[2])<<16 ...
|
||||
// ssa will generate a single large load.
|
||||
for offset := int64(1); offset < step; offset++ {
|
||||
lb := nod(OINDEX, cmpl, nodintconst(int64(i+offset)))
|
||||
lb = conv(lb, convType)
|
||||
lb = nod(OLSH, lb, nodintconst(int64(8*t.Elem().Width*offset)))
|
||||
cmplw = nod(OOR, cmplw, lb)
|
||||
rb := nod(OINDEX, cmpr, nodintconst(int64(i+offset)))
|
||||
rb = conv(rb, convType)
|
||||
rb = nod(OLSH, rb, nodintconst(int64(8*t.Elem().Width*offset)))
|
||||
cmprw = nod(OOR, cmprw, rb)
|
||||
}
|
||||
compare(cmplw, cmprw)
|
||||
i += step
|
||||
remains -= step * t.Elem().Width
|
||||
}
|
||||
}
|
||||
}
|
||||
if expr == nil {
|
||||
|
|
|
|||
|
|
@ -252,8 +252,6 @@ func mapaccess1_faststr(t *maptype, h *hmap, ky string) unsafe.Pointer {
|
|||
return add(unsafe.Pointer(b), dataOffset+bucketCnt*2*sys.PtrSize+i*uintptr(t.valuesize))
|
||||
}
|
||||
// check first 4 bytes
|
||||
// TODO: on amd64/386 at least, make this compile to one 4-byte comparison instead of
|
||||
// four 1-byte comparisons.
|
||||
if *((*[4]byte)(key.str)) != *((*[4]byte)(k.str)) {
|
||||
continue
|
||||
}
|
||||
|
|
|
|||
Loading…
Reference in New Issue