syscall: add support for setns after fork

This adds a JoinNamespaces field to Linux's SysProcAttr type.
When set, these namespaces will be entered after fork and before exec.

This allows users to exec a new process in a pre-defined set of
namespaces without having to resort to hacks or re-execs to bootstrap
these namespaces.

Closes #56680
This commit is contained in:
Brian Goff 2023-10-29 18:40:32 +00:00
parent 1ae729e6d3
commit 7fa12e8417
12 changed files with 259 additions and 0 deletions

View File

@ -105,6 +105,22 @@ type SysProcAttr struct {
// functionality is supported by the kernel, or -1. Note *PidFD is
// changed only if the process starts successfully.
PidFD *int
// JoinNamespaces lists the namespaces to join after fork and before exec. Namespaces are joined
// before any unshare calls. If you are using CloneFlags note that those
// flags will be used to do the initial fork, so they occur before joining
// these namespaces. It is expected that the caller has sorted the list in
// the order they want to join. It is possible for ordering to affect
// permissions to join other namespaces.
JoinNamespaces []LinuxNamespace
}
// LinuxNamespace identifies, by open file descriptor, an existing Linux
// namespace that a newly forked child process should join.
// See [SysProcAttr.JoinNamespaces].
type LinuxNamespace struct {
// Type is the kind of namespace that FD refers to. It is passed unchanged
// as the second argument of the setns system call; the tests in this
// change use CLONE_NEW* values such as CLONE_NEWNS.
Type int
// FD is the file descriptor referring to the namespace. It is closed in
// the child once the namespace has been joined.
FD int
}
var (
@ -352,6 +368,18 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att
}
}
// TODO: I think pid namespaces require some custom handling here.
for _, ns := range sys.JoinNamespaces {
_, _, err1 = RawSyscall(_SYS_setns, uintptr(ns.FD), uintptr(ns.Type), 0)
if err1 != 0 {
goto childerror
}
_, _, err1 = RawSyscall(SYS_CLOSE, uintptr(ns.FD), 0, 0)
if err1 != 0 {
goto childerror
}
}
// Wait for User ID/Group ID mappings to be written.
if sys.UidMappings != nil || sys.GidMappings != nil {
if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 {

View File

@ -477,6 +477,227 @@ func TestUseCgroupFD(t *testing.T) {
}
}
// getProcessNamespaces reads the namespace links found under /proc/<pid>/ns
// and returns them as a map from entry name (for example "mnt") to the link
// target (for example "mnt:[4026531840]").
//
// If the directory cannot be listed, or any entry cannot be read as a symbolic
// link, the error is returned and the map is nil.
func getProcessNamespaces(pid int) (map[string]string, error) {
	nsDir := fmt.Sprintf("/proc/%s/ns", strconv.Itoa(pid))

	entries, err := os.ReadDir(nsDir)
	if err != nil {
		return nil, err
	}

	links := make(map[string]string, len(entries))
	for _, entry := range entries {
		name := entry.Name()
		target, err := os.Readlink(filepath.Join(nsDir, name))
		if err != nil {
			return nil, err
		}
		links[name] = target
	}
	return links, nil
}
// createNamespaces starts a helper child process with unshareFlags as its
// clone flags and returns an open file for every namespace of that child that
// differs from the test process's own. The result is keyed by the entry name
// under /proc/<pid>/ns (for example "mnt" or "net").
//
// The returned files are closed through t.Cleanup. The helper child is told to
// exit (by closing its stdin) and is waited for before createNamespaces
// returns. The test is skipped when starting the child fails with an error
// that testenv.SyscallIsNotSupported recognizes.
func createNamespaces(t *testing.T, unshareFlags uintptr) map[string]*os.File {
	t.Helper()

	// Create the namespaces by re-execing the test binary with clone flags set.
	// A file descriptor is opened for each new namespace and handed back to the
	// caller; as long as the descriptor is open the namespace stays valid.
	cmd := testCmdReexec(t)
	cmd.Env = append(cmd.Env, "GO_TEST_CREATE_NAMSPACES=1")
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Cloneflags: unshareFlags,
	}

	// The child blocks reading stdin, which keeps it alive while its
	// namespaces are inspected and opened. Closing pw lets the child exit.
	pr, pw := io.Pipe()
	cmd.Stdin = pr

	if err := cmd.Start(); err != nil {
		pw.Close()
		if testenv.SyscallIsNotSupported(err) {
			t.Skipf("skipping due to permissions error: %v", err)
		}
		t.Fatal(err)
	}

	// Register the wait only after a successful Start. Waiting on a command
	// that never started returns an error, which would turn the skip above
	// into a test failure.
	defer func() {
		// Close pw so the child can exit.
		// It is not needed once this helper returns.
		pw.Close()
		if err := cmd.Wait(); err != nil {
			t.Error(err)
		}
	}()

	ls, err := getProcessNamespaces(cmd.Process.Pid)
	if err != nil {
		t.Fatal(err)
	}
	origNS, err := getProcessNamespaces(os.Getpid())
	if err != nil {
		t.Fatal(err)
	}

	// Open a file for every namespace created by the child, i.e. every
	// namespace whose id differs from the one this process is in.
	out := make(map[string]*os.File)
	for kind, id := range ls {
		// The "_for_children" marker is part of the entry name (for example
		// "pid_for_children"), not of the link target, so test kind here.
		if strings.HasSuffix(kind, "_for_children") {
			// This entry is not the process's own namespace but the one its
			// child processes will inherit. We aren't interested in it.
			continue
		}
		if origNS[kind] == id {
			t.Log("skipping namespace", kind, "as it is the same as the parent")
			continue
		}

		f, err := os.Open(fmt.Sprintf("/proc/%d/ns/%s", cmd.Process.Pid, kind))
		if err != nil {
			t.Fatal(err)
		}
		t.Cleanup(func() { f.Close() })
		out[kind] = f
	}
	return out
}
// testCmdReexec builds a command that re-runs the current test binary,
// restricted to the calling test by name, with GO_WANT_HELPER_PROCESS=1
// appended to the command's environment. The command is not started.
func testCmdReexec(t *testing.T) *exec.Cmd {
	t.Helper()

	self, err := os.Executable()
	if err != nil {
		t.Fatal(err)
	}

	// Anchor the pattern so only the calling test is selected in the child.
	runOnly := "-test.run=^" + t.Name() + "$"

	c := testenv.Command(t, self, runOnly)
	c.Env = append(c.Environ(), "GO_WANT_HELPER_PROCESS=1")
	return c
}
// TestJoinNamespaces checks that a child started with
// SysProcAttr.JoinNamespaces ends up in the requested namespaces and in no
// others.
//
// The test binary re-executes itself in two helper roles, selected through
// environment variables:
//   - GO_WANT_HELPER_PROCESS=1 with GO_TEST_CREATE_NAMSPACES=1: block on stdin
//     until it is closed, then exit. createNamespaces uses this role.
//   - GO_WANT_HELPER_PROCESS=1 alone: print the link target of each of the
//     process's namespaces, one per line, then exit.
func TestJoinNamespaces(t *testing.T) {
	testenv.MustHaveExec(t)

	if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" {
		if os.Getenv("GO_TEST_CREATE_NAMSPACES") == "1" {
			// Park on stdin; the byte (if any) is irrelevant. The caller
			// closing stdin is the signal to exit.
			var scratch [1]byte
			os.Stdin.Read(scratch[:])
			os.Exit(0)
		}

		links, err := getProcessNamespaces(os.Getpid())
		if err != nil {
			fmt.Fprintln(os.Stderr, err)
			os.Exit(2)
		}
		for name := range links {
			fmt.Println(links[name])
		}
		os.Exit(0)
	}

	// Clone flag for each namespace kind exercised below, keyed by the kind's
	// entry name under /proc/<pid>/ns.
	cloneFlagFor := map[string]uintptr{
		"mnt": syscall.CLONE_NEWNS,
		"uts": syscall.CLONE_NEWUTS,
		"net": syscall.CLONE_NEWNET,
	}

	// joinCmd builds a re-exec command that joins the listed kinds, in the
	// given order, using the matching files from nsFiles.
	joinCmd := func(nsFiles map[string]*os.File, order ...string) *exec.Cmd {
		cmd := testCmdReexec(t)
		attr := &syscall.SysProcAttr{}
		cmd.SysProcAttr = attr

		for _, kind := range order {
			flag, known := cloneFlagFor[kind]
			if !known {
				panic("unexpected namespace kind: " + kind)
			}
			file, have := nsFiles[kind]
			if !have {
				panic(fmt.Sprintf("missing namespace fd for %q: %v", kind, nsFiles))
			}
			attr.JoinNamespaces = append(attr.JoinNamespaces, syscall.LinuxNamespace{
				Type: int(flag),
				FD:   int(file.Fd()),
			})
		}
		return cmd
	}

	parentNS, err := getProcessNamespaces(os.Getpid())
	if err != nil {
		t.Fatal(err)
	}

	// verify compares every namespace link reported by the child against this
	// process's own link of the same kind.
	verify := func(t *testing.T, joined map[string]*os.File, childOut []byte) {
		report := strings.TrimSpace(string(childOut))
		for _, link := range strings.Split(report, "\n") {
			// A link looks like "<kind>:[4026531840]"; the part before the
			// colon says which kind of namespace it is.
			kind, _, found := strings.Cut(link, ":")
			if !found {
				t.Fatalf("unexpected output from child: %q", report)
			}
			parentLink, known := parentNS[kind]
			if !known {
				t.Fatalf("unexpected namespace kind: %q", kind)
			}

			_, shouldDiffer := joined[kind]
			switch {
			case shouldDiffer && link == parentLink:
				// A namespace we asked to join must differ from ours.
				t.Errorf("subprocess did not join new ns for %q", kind)
			case !shouldDiffer && link != parentLink:
				// A namespace we did not ask to join must match ours.
				t.Errorf("subprocess joined ns for %q", kind)
			}
		}
	}

	t.Run("one namespace", func(t *testing.T) {
		nsFiles := createNamespaces(t, syscall.CLONE_NEWNS)
		if _, have := nsFiles["mnt"]; !have {
			t.Fatal("expected to create a namespace for mnt")
		}

		cmd := joinCmd(nsFiles, "mnt")
		output, err := cmd.CombinedOutput()
		if err != nil {
			t.Fatalf("cmd failed with err %v, output: %s", err, output)
		}
		verify(t, nsFiles, output)
	})

	t.Run("multiple namespaces", func(t *testing.T) {
		const flags = syscall.CLONE_NEWNS | syscall.CLONE_NEWNET | syscall.CLONE_NEWUTS
		nsFiles := createNamespaces(t, flags)
		for kind := range cloneFlagFor {
			if _, have := nsFiles[kind]; !have {
				t.Fatalf("expected to create a namespace for %q", kind)
			}
		}

		cmd := joinCmd(nsFiles, "mnt", "net", "uts")
		output, err := cmd.CombinedOutput()
		if err != nil {
			t.Fatalf("cmd failed with err %v, output: %s", err, output)
		}
		verify(t, nsFiles, output)
	})
}
func TestCloneTimeNamespace(t *testing.T) {
testenv.MustHaveExec(t)

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
func setTimespec(sec, nsec int64) Timespec {

View File

@ -13,6 +13,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = 308
)
//sys Dup2(oldfd int, newfd int) (err error)

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
func setTimespec(sec, nsec int64) Timespec {

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
//sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
//sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT

View File

@ -15,6 +15,7 @@ const (
_SYS_clone3 = 5435
_SYS_faccessat2 = 5439
_SYS_fchmodat2 = 5452
_SYS_setns = SYS_SETNS
)
//sys Dup2(oldfd int, newfd int) (err error)

View File

@ -13,6 +13,7 @@ const (
_SYS_clone3 = 4435
_SYS_faccessat2 = 4439
_SYS_fchmodat2 = 4452
_SYS_setns = SYS_SETNS
)
func Syscall9(trap, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno)

View File

@ -15,6 +15,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
//sys Dup2(oldfd int, newfd int) (err error)

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
//sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT

View File

@ -11,6 +11,7 @@ const (
_SYS_clone3 = 435
_SYS_faccessat2 = 439
_SYS_fchmodat2 = 452
_SYS_setns = SYS_SETNS
)
//sys Dup2(oldfd int, newfd int) (err error)