runtime: use hardware divider to improve performance

The hardware divider is an optional component of ARMv7. This patch
detects at run time whether it is available and uses it if so.

1. The hardware divider is detected at startup and a flag is set/cleared
   according to a particular bit of runtime.hwcap.
2. Each call of runtime.udiv checks this flag and decides whether to
   use the hardware division instruction.

A rough test shows a 40-50% performance improvement on ARMv7, and
compatibility with ARMv5/v6 is not broken.

fixes #19118

Change-Id: Ic586bc9659ebc169553ca2004d2bdb721df823ac
Reviewed-on: https://go-review.googlesource.com/37496
Run-TryBot: Cherry Zhang <cherryyz@google.com>
TryBot-Result: Gobot Gobot <gobot@golang.org>
Reviewed-by: Cherry Zhang <cherryyz@google.com>
This commit is contained in:
Ben Shi 2017-02-27 07:56:57 +00:00 committed by Cherry Zhang
parent 2a8d99e427
commit 69261ecad6
16 changed files with 88 additions and 0 deletions

View File

@ -12,6 +12,7 @@
// int8_t DidInitRun();
// int8_t DidMainRun();
// int32_t FromPkg();
// uint32_t Divu(uint32_t, uint32_t);
int main(void) {
int8_t ran_init = DidInitRun();
if (!ran_init) {
@ -30,6 +31,11 @@ int main(void) {
fprintf(stderr, "ERROR: FromPkg=%d, want %d\n", from_pkg, 1024);
return 1;
}
uint32_t divu = Divu(2264, 31);
if (divu != 73) {
fprintf(stderr, "ERROR: Divu(2264, 31)=%d, want %d\n", divu, 73);
return 1;
}
// test.bash looks for "PASS" to ensure this program has reached the end.
printf("PASS\n");
return 0;

View File

@ -8,3 +8,5 @@ import "C"
// FromPkg returns the fixed value 1024. It is exported to C through cgo
// so that a C caller can verify it is able to call into this package.
//
//export FromPkg
func FromPkg() int32 {
	const fixed int32 = 1024
	return fixed
}
// Divu returns the unsigned 32-bit quotient a / b, truncated toward zero.
// It is exported to C through cgo so that unsigned division can be
// exercised from a C caller. As with any Go integer division, b == 0
// causes a run-time panic.
//
//export Divu
func Divu(a, b uint32) uint32 {
	quotient := a / b
	return quotient
}

View File

@ -400,6 +400,12 @@ func TestTrivialExecutablePIE(t *testing.T) {
AssertHasRPath(t, "./trivial.pie", gorootInstallDir)
}
// TestDivisionExecutable builds the "division" test program with
// -linkshared and then runs the installed binary at ./bin/division.
// NOTE(review): goCmd and run are helpers defined elsewhere in this file;
// presumably each fails the test if its command fails — confirm there.
func TestDivisionExecutable(t *testing.T) {
// Install the program linked against the shared libraries.
goCmd(t, "install", "-linkshared", "division")
// Execute the installed binary; the first string is a label for the run.
run(t, "division executable", "./bin/division")
}
// Build an executable that uses cgo linked against the shared runtime and check it
// runs.
func TestCgoExecutable(t *testing.T) {

View File

@ -0,0 +1,17 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package main
// div returns the unsigned 32-bit quotient x / y, truncated toward zero.
//
// It is marked noinline, presumably so the division is performed at run
// time rather than being folded away at the constant call site.
//
//go:noinline
func div(x, y uint32) uint32 {
	q := x / y
	return q
}
// main performs one unsigned division with a known answer (97 / 11 == 8)
// and panics with "FAIL" if the result is anything else.
func main() {
	if got := div(97, 11); got != 8 {
		panic("FAIL")
	}
}

View File

@ -965,6 +965,13 @@ jmp_label_3:
REVSH R1, R2 // b12fffe6
RBIT R1, R2 // 312fffe6
// DIVHW R0, R1, R2: R1 / R0 -> R2
DIVHW R0, R1, R2 // 11f012e7
DIVUHW R0, R1, R2 // 11f032e7
// DIVHW R0, R1: R1 / R0 -> R1
DIVHW R0, R1 // 11f011e7
DIVUHW R0, R1 // 11f031e7
//
// END
//

View File

@ -247,6 +247,8 @@ const (
ADIV
AMOD
AMODU
ADIVHW
ADIVUHW
AMOVB
AMOVBS

View File

@ -71,6 +71,8 @@ var Anames = []string{
"DIV",
"MOD",
"MODU",
"DIVHW",
"DIVUHW",
"MOVB",
"MOVBS",
"MOVBU",

View File

@ -142,6 +142,8 @@ var optab = []Optab{
{AMUL, C_REG, C_NONE, C_REG, 15, 4, 0, 0, 0},
{ADIV, C_REG, C_REG, C_REG, 16, 4, 0, 0, 0},
{ADIV, C_REG, C_NONE, C_REG, 16, 4, 0, 0, 0},
{ADIVHW, C_REG, C_REG, C_REG, 105, 4, 0, 0, 0},
{ADIVHW, C_REG, C_NONE, C_REG, 105, 4, 0, 0, 0},
{AMULL, C_REG, C_REG, C_REGREG, 17, 4, 0, 0, 0},
{AMULA, C_REG, C_REG, C_REGREG2, 17, 4, 0, 0, 0},
{AMOVW, C_REG, C_NONE, C_SAUTO, 20, 4, REGSP, 0, 0},
@ -1401,6 +1403,9 @@ func buildop(ctxt *obj.Link) {
opset(AMODU, r0)
opset(ADIVU, r0)
case ADIVHW:
opset(ADIVUHW, r0)
case AMOVW,
AMOVB,
AMOVBS,
@ -2407,6 +2412,16 @@ func (c *ctxt5) asmout(p *obj.Prog, o *Optab, out []uint32) {
if p.As == ADATABUNDLE {
o1 = 0xe125be70
}
case 105: /* divhw r,[r,]r */
o1 = c.oprrr(p, p.As, int(p.Scond))
rf := int(p.From.Reg)
rt := int(p.To.Reg)
r := int(p.Reg)
if r == 0 {
r = rt
}
o1 |= (uint32(rf)&15)<<8 | (uint32(r)&15)<<0 | (uint32(rt)&15)<<16
}
out[0] = o1
@ -2445,6 +2460,10 @@ func (c *ctxt5) oprrr(p *obj.Prog, a obj.As, sc int) uint32 {
c.ctxt.Diag(".nil/.W on dp instruction")
}
switch a {
case ADIVHW:
return o | 0x71<<20 | 0xf<<12 | 0x1<<4
case ADIVUHW:
return o | 0x73<<20 | 0xf<<12 | 0x1<<4
case AMMUL:
return o | 0x75<<20 | 0xf<<12 | 0x1<<4
case AMULS:

View File

@ -4,6 +4,8 @@
package runtime
var hardDiv bool // TODO: set if a hardware divider is available
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.

View File

@ -4,6 +4,8 @@
package runtime
var hardDiv bool // TODO: set if a hardware divider is available
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.

View File

@ -11,11 +11,13 @@ const (
_HWCAP_VFP = 1 << 6 // introduced in at least 2.6.11
_HWCAP_VFPv3 = 1 << 13 // introduced in 2.6.30
_HWCAP_IDIVA = 1 << 17
)
var randomNumber uint32
var armArch uint8 = 6 // we default to ARMv6
var hwcap uint32 // set by setup_auxv
var hardDiv bool // set if a hardware divider is available
func checkgoarm() {
// On Android, /proc/self/auxv might be unreadable and hwcap won't
@ -53,6 +55,7 @@ func archauxv(tag, val uintptr) {
case _AT_HWCAP: // CPU capability bit flags
hwcap = uint32(val)
hardDiv = (hwcap & _HWCAP_IDIVA) != 0
}
}

View File

@ -4,6 +4,8 @@
package runtime
var hardDiv bool // TODO: set if a hardware divider is available
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.

View File

@ -6,6 +6,8 @@ package runtime
import "unsafe"
var hardDiv bool // TODO: set if a hardware divider is available
func lwp_mcontext_init(mc *mcontextt, stk unsafe.Pointer, mp *m, gp *g, fn uintptr) {
// Machine dependent mcontext initialisation for LWP.
mc.__gregs[_REG_R15] = uint32(funcPC(lwp_tramp))

View File

@ -4,6 +4,8 @@
package runtime
var hardDiv bool // TODO: set if a hardware divider is available
func checkgoarm() {
// TODO(minux): FP checks like in os_linux_arm.go.

View File

@ -4,6 +4,8 @@
package runtime
var hardDiv bool // TODO: set if a hardware divider is available
// checkgoarm is currently a stub in this file: it performs no checks and
// returns immediately.
func checkgoarm() {
	// TODO(minux)
}

View File

@ -119,6 +119,10 @@ TEXT runtime·_sfloatpanic(SB),NOSPLIT,$-4
// Be careful: Ra == R11 will be used by the linker for synthesized instructions.
TEXT udiv(SB),NOSPLIT,$-4
MOVBU runtime·hardDiv(SB), Ra
CMP $0, Ra
BNE udiv_hardware
CLZ Rq, Rs // find normalizing shift
MOVW.S Rq<<Rs, Ra
MOVW $fast_udiv_tab<>-64(SB), RM
@ -154,6 +158,14 @@ TEXT udiv(SB),NOSPLIT,$-4
ADD.PL $2, Rq
RET
// use hardware divider
udiv_hardware:
DIVUHW Rq, Rr, Rs
MUL Rs, Rq, RM
RSB Rr, RM, Rr
MOVW Rs, Rq
RET
udiv_by_large_d:
// at this point we know d>=2^(31-6)=2^25
SUB $4, Ra, Ra