From 7fa12e8417cfdc19aa5c93b5320b1c926a56c4a1 Mon Sep 17 00:00:00 2001 From: Brian Goff Date: Sun, 29 Oct 2023 18:40:32 +0000 Subject: [PATCH] syscall: add support for setns after fork This adds a JoinNamespaces field to Linux's SysProcAttr type. When set, these namespaces will be entered after fork and before exec. This allows users to exec a new process in a pre-defined set of namespaces without having to resort to hacks or re-execs to bootstrap these namespaces. Closes #56680 --- src/syscall/exec_linux.go | 28 ++++ src/syscall/exec_linux_test.go | 221 +++++++++++++++++++++++++++ src/syscall/syscall_linux_386.go | 1 + src/syscall/syscall_linux_amd64.go | 1 + src/syscall/syscall_linux_arm.go | 1 + src/syscall/syscall_linux_arm64.go | 1 + src/syscall/syscall_linux_loong64.go | 1 + src/syscall/syscall_linux_mips64x.go | 1 + src/syscall/syscall_linux_mipsx.go | 1 + src/syscall/syscall_linux_ppc64x.go | 1 + src/syscall/syscall_linux_riscv64.go | 1 + src/syscall/syscall_linux_s390x.go | 1 + 12 files changed, 259 insertions(+) diff --git a/src/syscall/exec_linux.go b/src/syscall/exec_linux.go index e6d6343ed8..bf88d14fa5 100644 --- a/src/syscall/exec_linux.go +++ b/src/syscall/exec_linux.go @@ -105,6 +105,22 @@ type SysProcAttr struct { // functionality is supported by the kernel, or -1. Note *PidFD is // changed only if the process starts successfully. PidFD *int + // JoinNamespaces lists the namespaces to join after fork and before exec. Namespaces are joined + // before any unshare calls. If you are using Cloneflags, note that those + // flags will be used to do the initial fork, so they occur before joining + // these namespaces. It is expected that the caller has sorted the list in + // the order they want to join. It is possible for ordering to affect + // permissions to join other namespaces. + JoinNamespaces []LinuxNamespace +} + +// LinuxNamespace represents a Linux namespace that can be joined by a process. +// See [SysProcAttr.JoinNamespaces]. 
+type LinuxNamespace struct { + // Type of namespace that FD refers to. + Type int + // FD is the file descriptor referring to the namespace. + FD int } var ( @@ -352,6 +368,18 @@ func forkAndExecInChild1(argv0 *byte, argv, envv []*byte, chroot, dir *byte, att } } + // TODO: I think pid namespaces require some custom handling here. + for _, ns := range sys.JoinNamespaces { + _, _, err1 = RawSyscall(_SYS_setns, uintptr(ns.FD), uintptr(ns.Type), 0) + if err1 != 0 { + goto childerror + } + _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(ns.FD), 0, 0) + if err1 != 0 { + goto childerror + } + } + // Wait for User ID/Group ID mappings to be written. if sys.UidMappings != nil || sys.GidMappings != nil { if _, _, err1 = RawSyscall(SYS_CLOSE, uintptr(mapPipe[1]), 0, 0); err1 != 0 { diff --git a/src/syscall/exec_linux_test.go b/src/syscall/exec_linux_test.go index 68ec6fe3f8..6e5a303c67 100644 --- a/src/syscall/exec_linux_test.go +++ b/src/syscall/exec_linux_test.go @@ -477,6 +477,227 @@ func TestUseCgroupFD(t *testing.T) { } } +func getProcessNamespaces(pid int) (map[string]string, error) { + ls, err := os.ReadDir(fmt.Sprintf("/proc/%d/ns", pid)) + if err != nil { + return nil, err + } + + out := make(map[string]string) + for _, ns := range ls { + id, err := os.Readlink(filepath.Join("/proc", strconv.Itoa(pid), "ns", ns.Name())) + if err != nil { + return nil, err + } + out[ns.Name()] = id + } + return out, nil +} + +func createNamespaces(t *testing.T, unshareFlags uintptr) map[string]*os.File { + t.Helper() + + // Create a new namespace by re-execing the test binary with clone flags set. + // We'll capture the file descriptor of the new namespace and pass it back to the caller. + // As long as the file descriptor is open the namespace will be valid. 
+ + cmd := testCmdReexec(t) + cmd.Env = append(cmd.Env, "GO_TEST_CREATE_NAMSPACES=1") + cmd.SysProcAttr = &syscall.SysProcAttr{ + Cloneflags: unshareFlags, + } + + // The child process will block on stdin so we can read the namespace before it exits. + // Once we close the pipe the child will exit. + pr, pw := io.Pipe() + cmd.Stdin = pr + defer func() { + // Close pw so the child can exit. + // We don't need this alive anymore after the helper function returns. + pw.Close() + if err := cmd.Wait(); err != nil { + t.Error(err) + } + }() + + if err := cmd.Start(); err != nil { + if testenv.SyscallIsNotSupported(err) { + t.Skipf("skipping due to permissions error: %v", err) + } + t.Fatal(err) + } + + ls, err := getProcessNamespaces(cmd.Process.Pid) + if err != nil { + t.Fatal(err) + } + + origNS, err := getProcessNamespaces(os.Getpid()) + if err != nil { + t.Fatal(err) + } + + // Open a new file for all namespaces created by the child. + // This would be any namespace that has a different id than the original namespaces. + out := make(map[string]*os.File) + for kind, id := range ls { + if strings.HasSuffix(id, "_for_children") { + // This is not a namespace, but rather a namespace that child processes will inherit. + // We aren't interested in this. 
+ continue + } + + if origNS[kind] == id { + t.Log("skipping namespace", kind, "as it is the same as the parent") + continue + } + + f, err := os.Open(fmt.Sprintf("/proc/%d/ns/%s", cmd.Process.Pid, kind)) + if err != nil { + t.Fatal(err) + } + t.Cleanup(func() { f.Close() }) + out[kind] = f + } + + return out +} + +func testCmdReexec(t *testing.T) *exec.Cmd { + t.Helper() + exe, err := os.Executable() + if err != nil { + t.Fatal(err) + } + + cmd := testenv.Command(t, exe, "-test.run=^"+t.Name()+"$") + cmd.Env = append(cmd.Environ(), "GO_WANT_HELPER_PROCESS=1") + return cmd +} + +func TestJoinNamespaces(t *testing.T) { + testenv.MustHaveExec(t) + + if os.Getenv("GO_WANT_HELPER_PROCESS") == "1" { + if os.Getenv("GO_TEST_CREATE_NAMSPACES") == "1" { + // Block on stdin, we don't care about this value. + // Once stdin is closed by the caller that is our signal to exit. + os.Stdin.Read(make([]byte, 1)) + os.Exit(0) + } + + ls, err := getProcessNamespaces(os.Getpid()) + if err != nil { + fmt.Fprintln(os.Stderr, err) + os.Exit(2) + } + + for _, ns := range ls { + fmt.Println(ns) + } + + os.Exit(0) + } + + // Mapping for the namespaces we'll be testing with + testNSKinds := map[string]uintptr{ + "mnt": syscall.CLONE_NEWNS, + "uts": syscall.CLONE_NEWUTS, + "net": syscall.CLONE_NEWNET, + } + + cmdWithJoins := func(fds map[string]*os.File, ordered ...string) *exec.Cmd { + cmd := testCmdReexec(t) + cmd.SysProcAttr = &syscall.SysProcAttr{} + + for _, kind := range ordered { + k, ok := testNSKinds[kind] + if !ok { + panic("unexpected namespace kind: " + kind) + } + + f, ok := fds[kind] + if !ok { + panic(fmt.Sprintf("missing namespace fd for %q: %v", kind, fds)) + } + + cmd.SysProcAttr.JoinNamespaces = append(cmd.SysProcAttr.JoinNamespaces, syscall.LinuxNamespace{ + Type: int(k), + FD: int(f.Fd()), + }) + } + + return cmd + } + + ourNsLS, err := getProcessNamespaces(os.Getpid()) + if err != nil { + t.Fatal(err) + } + + checkNs := func(t *testing.T, created map[string]*os.File, 
cmdOut []byte) { + out := strings.TrimSpace(string(cmdOut)) + + for _, v := range strings.Split(out, "\n") { + // link format is like "mnt:[4026531840]" + // We want to check what kind of ns this refers to + kind, _, ok := strings.Cut(v, ":") + if !ok { + t.Fatalf("unexpected output from child: %q", out) + } + + ours, ok := ourNsLS[kind] + if !ok { + t.Fatalf("unexpected namespace kind: %q", kind) + } + + if _, ok := created[kind]; ok { + // This is one we should have joined so it should be different from our namespace. + if v == ours { + t.Errorf("subprocess did not join new ns for %q", kind) + } + } else { + // This is one we should not have joined so it should be the same as our namespace. + if v != ours { + t.Errorf("subprocess joined ns for %q", kind) + } + } + } + } + + t.Run("one namespace", func(t *testing.T) { + nsFDs := createNamespaces(t, syscall.CLONE_NEWNS) + + if _, ok := nsFDs["mnt"]; !ok { + t.Fatal("expected to create a namespace for mnt") + } + + out, err := cmdWithJoins(nsFDs, "mnt").CombinedOutput() + if err != nil { + t.Fatalf("cmd failed with err %v, output: %s", err, out) + } + + checkNs(t, nsFDs, out) + }) + + t.Run("multiple namespaces", func(t *testing.T) { + nsFDs := createNamespaces(t, syscall.CLONE_NEWNS|syscall.CLONE_NEWNET|syscall.CLONE_NEWUTS) + + for kind := range testNSKinds { + if _, ok := nsFDs[kind]; !ok { + t.Fatalf("expected to create a namespace for %q", kind) + } + } + + out, err := cmdWithJoins(nsFDs, "mnt", "net", "uts").CombinedOutput() + if err != nil { + t.Fatalf("cmd failed with err %v, output: %s", err, out) + } + + checkNs(t, nsFDs, out) + }) +} + func TestCloneTimeNamespace(t *testing.T) { testenv.MustHaveExec(t) diff --git a/src/syscall/syscall_linux_386.go b/src/syscall/syscall_linux_386.go index a559f7e288..39138bae9b 100644 --- a/src/syscall/syscall_linux_386.go +++ b/src/syscall/syscall_linux_386.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) 
func setTimespec(sec, nsec int64) Timespec { diff --git a/src/syscall/syscall_linux_amd64.go b/src/syscall/syscall_linux_amd64.go index ec52f8a4bd..2012e0aa9c 100644 --- a/src/syscall/syscall_linux_amd64.go +++ b/src/syscall/syscall_linux_amd64.go @@ -13,6 +13,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = 308 ) //sys Dup2(oldfd int, newfd int) (err error) diff --git a/src/syscall/syscall_linux_arm.go b/src/syscall/syscall_linux_arm.go index a6d92cea13..6d2b1ee592 100644 --- a/src/syscall/syscall_linux_arm.go +++ b/src/syscall/syscall_linux_arm.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) func setTimespec(sec, nsec int64) Timespec { diff --git a/src/syscall/syscall_linux_arm64.go b/src/syscall/syscall_linux_arm64.go index b87b51c0c0..18343feff2 100644 --- a/src/syscall/syscall_linux_arm64.go +++ b/src/syscall/syscall_linux_arm64.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) //sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT diff --git a/src/syscall/syscall_linux_loong64.go b/src/syscall/syscall_linux_loong64.go index 634cf30cf2..b744975767 100644 --- a/src/syscall/syscall_linux_loong64.go +++ b/src/syscall/syscall_linux_loong64.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) //sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT diff --git a/src/syscall/syscall_linux_mips64x.go b/src/syscall/syscall_linux_mips64x.go index 41106ed81f..9cb26d77ea 100644 --- a/src/syscall/syscall_linux_mips64x.go +++ b/src/syscall/syscall_linux_mips64x.go @@ -15,6 +15,7 @@ const ( _SYS_clone3 = 5435 _SYS_faccessat2 = 5439 _SYS_fchmodat2 = 5452 + _SYS_setns = SYS_SETNS ) //sys Dup2(oldfd int, newfd int) (err error) diff --git 
a/src/syscall/syscall_linux_mipsx.go b/src/syscall/syscall_linux_mipsx.go index 7d4f8f2264..0af6a3bbfb 100644 --- a/src/syscall/syscall_linux_mipsx.go +++ b/src/syscall/syscall_linux_mipsx.go @@ -13,6 +13,7 @@ const ( _SYS_clone3 = 4435 _SYS_faccessat2 = 4439 _SYS_fchmodat2 = 4452 + _SYS_setns = SYS_SETNS ) func Syscall9(trap, a1, a2, a3, a4, a5, a6, a7, a8, a9 uintptr) (r1, r2 uintptr, err Errno) diff --git a/src/syscall/syscall_linux_ppc64x.go b/src/syscall/syscall_linux_ppc64x.go index 13c184c44f..9737fe9391 100644 --- a/src/syscall/syscall_linux_ppc64x.go +++ b/src/syscall/syscall_linux_ppc64x.go @@ -15,6 +15,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) //sys Dup2(oldfd int, newfd int) (err error) diff --git a/src/syscall/syscall_linux_riscv64.go b/src/syscall/syscall_linux_riscv64.go index 00872a74fb..a64abf1632 100644 --- a/src/syscall/syscall_linux_riscv64.go +++ b/src/syscall/syscall_linux_riscv64.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) //sys EpollWait(epfd int, events []EpollEvent, msec int) (n int, err error) = SYS_EPOLL_PWAIT diff --git a/src/syscall/syscall_linux_s390x.go b/src/syscall/syscall_linux_s390x.go index ea667ec1da..e4806867cb 100644 --- a/src/syscall/syscall_linux_s390x.go +++ b/src/syscall/syscall_linux_s390x.go @@ -11,6 +11,7 @@ const ( _SYS_clone3 = 435 _SYS_faccessat2 = 439 _SYS_fchmodat2 = 452 + _SYS_setns = SYS_SETNS ) //sys Dup2(oldfd int, newfd int) (err error)