diff --git a/src/cmd/internal/obj/ppc64/asm9.go b/src/cmd/internal/obj/ppc64/asm9.go index ea76543963..0243dd4744 100644 --- a/src/cmd/internal/obj/ppc64/asm9.go +++ b/src/cmd/internal/obj/ppc64/asm9.go @@ -229,8 +229,8 @@ var optab = []Optab{ {as: AMOVD, a1: C_LACON, a6: C_REG, type_: 26, size: 8}, {as: AMOVD, a1: C_SOREG, a6: C_REG, type_: 8, size: 4}, {as: AMOVD, a1: C_LOREG, a6: C_REG, type_: 36, size: 8}, - {as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 4}, - {as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 8}, + {as: AMOVD, a1: C_TLS_LE, a6: C_REG, type_: 79, size: 8}, + {as: AMOVD, a1: C_TLS_IE, a6: C_REG, type_: 80, size: 12}, {as: AMOVD, a1: C_TOCADDR, a6: C_REG, type_: 95, size: 8}, {as: AMOVD, a1: C_SPR, a6: C_REG, type_: 66, size: 4}, {as: AMOVD, a1: C_REG, a6: C_ADDR, type_: 74, size: 8}, @@ -2500,18 +2500,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { if v != 0 { c.ctxt.Diag("illegal indexed instruction\n%v", p) } - if c.ctxt.Flag_shared && r == REG_R13 { - rel := obj.Addrel(c.cursym) - rel.Off = int32(c.pc) - rel.Siz = 4 - // This (and the matching part in the load case - // below) are the only places in the ppc64 toolchain - // that knows the name of the tls variable. Possibly - // we could add some assembly syntax so that the name - // of the variable does not have to be assumed. - rel.Sym = c.ctxt.Lookup("runtime.tls_g") - rel.Type = objabi.R_POWER_TLS - } o1 = AOP_RRR(c.opstorex(p.As), uint32(p.From.Reg), uint32(p.To.Index), uint32(r)) } else { if int32(int16(v)) != v { @@ -2536,13 +2524,6 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { if v != 0 { c.ctxt.Diag("illegal indexed instruction\n%v", p) } - if c.ctxt.Flag_shared && r == REG_R13 { - rel := obj.Addrel(c.cursym) - rel.Off = int32(c.pc) - rel.Siz = 4 - rel.Sym = c.ctxt.Lookup("runtime.tls_g") - rel.Type = objabi.R_POWER_TLS - } o1 = AOP_RRR(c.oploadx(p.As), uint32(p.To.Reg), uint32(p.From.Index), uint32(r)) } else { if int32(int16(v)) != v { @@ -3511,10 +3492,11 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { if p.From.Offset != 0 { c.ctxt.Diag("invalid offset against tls var %v", p) } - o1 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), REGZERO, 0) + o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R13, 0) + o2 = AOP_IRR(OP_ADDI, uint32(p.To.Reg), uint32(p.To.Reg), 0) rel := obj.Addrel(c.cursym) rel.Off = int32(c.pc) - rel.Siz = 4 + rel.Siz = 8 rel.Sym = p.From.Sym rel.Type = objabi.R_POWER_TLS_LE @@ -3524,11 +3506,17 @@ func (c *ctxt9) asmout(p *obj.Prog, o *Optab, out []uint32) { } o1 = AOP_IRR(OP_ADDIS, uint32(p.To.Reg), REG_R2, 0) o2 = AOP_IRR(c.opload(AMOVD), uint32(p.To.Reg), uint32(p.To.Reg), 0) + o3 = AOP_RRR(OP_ADD, uint32(p.To.Reg), uint32(p.To.Reg), REG_R13) rel := obj.Addrel(c.cursym) rel.Off = int32(c.pc) rel.Siz = 8 rel.Sym = p.From.Sym rel.Type = objabi.R_POWER_TLS_IE + rel = obj.Addrel(c.cursym) + rel.Off = int32(c.pc) + 8 + rel.Siz = 4 + rel.Sym = p.From.Sym + rel.Type = objabi.R_POWER_TLS case 81: v := c.vregoff(&p.To) diff --git a/src/cmd/internal/objabi/reloctype.go b/src/cmd/internal/objabi/reloctype.go index b241127b4e..ea55fa3b0a 100644 --- a/src/cmd/internal/objabi/reloctype.go +++ b/src/cmd/internal/objabi/reloctype.go @@ -167,8 +167,8 @@ const ( // R_POWER_TLS_LE is used to implement the "local exec" model for tls // access. It resolves to the offset of the thread-local symbol from the - // thread pointer (R13) and inserts this value into the low 16 bits of an - // instruction word. + // thread pointer (R13) and is split against a pair of instructions to + // support a 32 bit displacement. R_POWER_TLS_LE // R_POWER_TLS_IE is used to implement the "initial exec" model for tls access. It @@ -178,10 +178,12 @@ const ( // symbol from the thread pointer (R13)). R_POWER_TLS_IE - // R_POWER_TLS marks an X-form instruction such as "MOVD 0(R13)(R31*1), g" as - // accessing a particular thread-local symbol. It does not affect code generation - // but is used by the system linker when relaxing "initial exec" model code to - // "local exec" model code. + // R_POWER_TLS marks an X-form instruction such as "ADD R3,R13,R4" as completing + // a sequence of GOT-relative relocations to compute a TLS address. This can be + // used by the system linker to to rewrite the GOT-relative TLS relocation into a + // simpler thread-pointer relative relocation. See table 3.26 and 3.28 in the + // ppc64 elfv2 1.4 ABI on this transformation. Likewise, the second argument + // (usually called RB in X-form instructions) is assumed to be R13. R_POWER_TLS // R_ADDRPOWER_DS is similar to R_ADDRPOWER above, but assumes the second diff --git a/src/cmd/link/internal/ppc64/asm.go b/src/cmd/link/internal/ppc64/asm.go index e8e258a1f3..06385026d9 100644 --- a/src/cmd/link/internal/ppc64/asm.go +++ b/src/cmd/link/internal/ppc64/asm.go @@ -418,6 +418,7 @@ func xcoffreloc1(arch *sys.Arch, out *ld.OutBuf, ldr *loader.Loader, s loader.Sy emitReloc(ld.XCOFF_R_TOCU|(0x0F<<8), 2) emitReloc(ld.XCOFF_R_TOCL|(0x0F<<8), 6) case objabi.R_POWER_TLS_LE: + // This only supports 16b relocations. It is fixed up in archreloc. emitReloc(ld.XCOFF_R_TLS_LE|0x0F<<8, 2) case objabi.R_CALLPOWER: if r.Size != 4 { @@ -458,7 +459,10 @@ func elfreloc1(ctxt *ld.Link, out *ld.OutBuf, ldr *loader.Loader, s loader.Sym, case objabi.R_POWER_TLS: out.Write64(uint64(elf.R_PPC64_TLS) | uint64(elfsym)<<32) case objabi.R_POWER_TLS_LE: - out.Write64(uint64(elf.R_PPC64_TPREL16) | uint64(elfsym)<<32) + out.Write64(uint64(elf.R_PPC64_TPREL16_HA) | uint64(elfsym)<<32) + out.Write64(uint64(r.Xadd)) + out.Write64(uint64(sectoff + 4)) + out.Write64(uint64(elf.R_PPC64_TPREL16_LO) | uint64(elfsym)<<32) case objabi.R_POWER_TLS_IE: out.Write64(uint64(elf.R_PPC64_GOT_TPREL16_HA) | uint64(elfsym)<<32) out.Write64(uint64(r.Xadd)) @@ -797,11 +801,25 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade if !target.IsAIX() { return val, nExtReloc, false } - case objabi.R_POWER_TLS, objabi.R_POWER_TLS_LE, objabi.R_POWER_TLS_IE: - // check Outer is nil, Type is TLSBSS? + case objabi.R_POWER_TLS: nExtReloc = 1 - if rt == objabi.R_POWER_TLS_IE { - nExtReloc = 2 // need two ELF relocations, see elfreloc1 + return val, nExtReloc, true + case objabi.R_POWER_TLS_LE, objabi.R_POWER_TLS_IE: + if target.IsAIX() && rt == objabi.R_POWER_TLS_LE { + // Fixup val, an addis/addi pair of instructions, which generate a 32b displacement + // from the threadpointer (R13), into a 16b relocation. XCOFF only supports 16b + // TLS LE relocations. Likewise, verify this is an addis/addi sequence. + const expectedOpcodes = 0x3C00000038000000 + const expectedOpmasks = 0xFC000000FC000000 + if uint64(val)&expectedOpmasks != expectedOpcodes { + ldr.Errorf(s, "relocation for %s+%d is not an addis/addi pair: %16x", ldr.SymName(rs), r.Off(), uint64(val)) + } + nval := (int64(uint32(0x380d0000)) | val&0x03e00000) << 32 // addi rX, r13, $0 + nval |= int64(0x60000000) // nop + val = nval + nExtReloc = 1 + } else { + nExtReloc = 2 } return val, nExtReloc, true case objabi.R_ADDRPOWER, @@ -855,10 +873,26 @@ func archreloc(target *ld.Target, ldr *loader.Loader, syms *ld.ArchSyms, r loade // the TLS. v -= 0x800 } - if int64(int16(v)) != v { + + var o1, o2 uint32 + if int64(int32(v)) != v { ldr.Errorf(s, "TLS offset out of range %d", v) } - return (val &^ 0xffff) | (v & 0xffff), nExtReloc, true + if target.IsBigEndian() { + o1 = uint32(val >> 32) + o2 = uint32(val) + } else { + o1 = uint32(val) + o2 = uint32(val >> 32) + } + + o1 |= uint32(((v + 0x8000) >> 16) & 0xFFFF) + o2 |= uint32(v & 0xFFFF) + + if target.IsBigEndian() { + return int64(o1)<<32 | int64(o2), nExtReloc, true + } + return int64(o2)<<32 | int64(o1), nExtReloc, true } return val, nExtReloc, false diff --git a/src/runtime/race_ppc64le.s b/src/runtime/race_ppc64le.s index b09f37031c..069e4d86dd 100644 --- a/src/runtime/race_ppc64le.s +++ b/src/runtime/race_ppc64le.s @@ -36,9 +36,9 @@ // racecalladdr. // // The sequence used to get the race ctx: -// MOVD runtime·tls_g(SB), R10 // offset to TLS -// MOVD 0(R13)(R10*1), g // R13=TLS for this thread, g = R30 -// MOVD g_racectx(g), R3 // racectx == ThreadState +// MOVD runtime·tls_g(SB), R10 // Address of TLS variable +// MOVD 0(R10), g // g = R30 +// MOVD g_racectx(g), R3 // racectx == ThreadState // func runtime·RaceRead(addr uintptr) // Called from instrumented Go code @@ -137,7 +137,7 @@ TEXT runtime·racewriterangepc1(SB), NOSPLIT, $0-24 // Otherwise, setup goroutine context and invoke racecall. Other arguments already set. TEXT racecalladdr<>(SB), NOSPLIT, $0-0 MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 // goroutine context // Check that addr is within [arenastart, arenaend) or within [racedatastart, racedataend). MOVD runtime·racearenastart(SB), R9 @@ -173,7 +173,7 @@ TEXT runtime·racefuncenter(SB), NOSPLIT, $0-8 // R11 = caller's return address TEXT racefuncenter<>(SB), NOSPLIT, $0-0 MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState MOVD R8, R4 // caller pc set by caller in R8 // void __tsan_func_enter(ThreadState *thr, void *pc); @@ -185,7 +185,7 @@ TEXT racefuncenter<>(SB), NOSPLIT, $0-0 // Called from Go instrumented code. TEXT runtime·racefuncexit(SB), NOSPLIT, $0-0 MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState // void __tsan_func_exit(ThreadState *thr); MOVD $__tsan_func_exit(SB), R8 @@ -380,7 +380,7 @@ racecallatomic_data: racecallatomic_ok: // Addr is within the good range, call the atomic function. MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 // goroutine racectx aka *ThreadState MOVD R8, R5 // pc is the function called MOVD (R1), R4 // caller pc from stack @@ -394,7 +394,7 @@ racecallatomic_ignore: MOVD R6, R17 // save the original arg list addr MOVD $__tsan_go_ignore_sync_begin(SB), R8 // func addr to call MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 // goroutine context BL racecall<>(SB) MOVD R15, R8 // restore the original function @@ -402,7 +402,7 @@ racecallatomic_ignore: // Call the atomic function. // racecall will call LLVM race code which might clobber r30 (g) MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_racectx(g), R3 MOVD R8, R4 // pc being called same TODO as above @@ -434,7 +434,7 @@ TEXT racecall<>(SB), NOSPLIT, $0-0 MOVD R10, 16(R1) // C ABI // Get info from the current goroutine MOVD runtime·tls_g(SB), R10 // g offset in TLS - MOVD 0(R13)(R10*1), g // R13 = current TLS + MOVD 0(R10), g MOVD g_m(g), R7 // m for g MOVD R1, R16 // callee-saved, preserved across C call MOVD m_g0(R7), R10 // g0 for m @@ -448,7 +448,7 @@ call: XOR R0, R0 // clear R0 on return from Clang MOVD R16, R1 // restore R1; R16 nonvol in Clang MOVD runtime·tls_g(SB), R10 // find correct g - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD 16(R1), R10 // LR was saved away, restore for return MOVD R10, LR RET @@ -469,7 +469,7 @@ TEXT runtime·racecallbackthunk(SB), NOSPLIT, $-8 // g0 TODO: Don't modify g here since R30 is nonvolatile MOVD g, R9 MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_m(g), R3 MOVD m_p(R3), R3 MOVD p_raceprocctx(R3), R3 @@ -527,7 +527,7 @@ rest: MOVD R4, FIXED_FRAME+8(R1) MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_m(g), R7 MOVD m_g0(R7), R8 @@ -540,7 +540,7 @@ rest: // All registers are clobbered after Go code, reload. MOVD runtime·tls_g(SB), R10 - MOVD 0(R13)(R10*1), g + MOVD 0(R10), g MOVD g_m(g), R7 MOVD m_curg(R7), g // restore g = m->curg diff --git a/src/runtime/tls_ppc64x.s b/src/runtime/tls_ppc64x.s index c697449282..25d796fcc6 100644 --- a/src/runtime/tls_ppc64x.s +++ b/src/runtime/tls_ppc64x.s @@ -29,7 +29,7 @@ TEXT runtime·save_g(SB),NOSPLIT|NOFRAME,$0-0 BEQ nocgo #endif MOVD runtime·tls_g(SB), R31 - MOVD g, 0(R13)(R31*1) + MOVD g, 0(R31) nocgo: RET @@ -45,7 +45,7 @@ nocgo: // NOTE: _cgo_topofstack assumes this only clobbers g (R30), and R31. TEXT runtime·load_g(SB),NOSPLIT|NOFRAME,$0-0 MOVD runtime·tls_g(SB), R31 - MOVD 0(R13)(R31*1), g + MOVD 0(R31), g RET GLOBL runtime·tls_g+0(SB), TLSBSS+DUPOK, $8