// Copyright (c) 2021 Gitpod GmbH. All rights reserved. // Licensed under the GNU Affero General Public License (AGPL). // See License.AGPL.txt in the project root for license information. package seccomp import ( "context" "errors" "fmt" "io" "io/fs" "os" "path/filepath" "strconv" "strings" "syscall" "time" "github.com/moby/sys/mountinfo" "golang.org/x/sys/unix" "golang.org/x/xerrors" "github.com/gitpod-io/gitpod/common-go/log" "github.com/gitpod-io/gitpod/workspacekit/pkg/readarg" daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api" libseccomp "github.com/seccomp/libseccomp-golang" ) type syscallHandler func(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) // SyscallHandler handles seccomp syscall notifications type SyscallHandler interface { Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) } func mapHandler(h SyscallHandler) map[string]syscallHandler { return map[string]syscallHandler{ "mount": h.Mount, "umount": h.Umount, "umount2": h.Umount, "bind": h.Bind, "chown": h.Chown, } } // LoadFilter loads the syscall filter required to make the handler work. // Calling this function has a range of side-effects: // - we'll lock the caller using `runtime.LockOSThread()` // - we'll set no_new_privs on the process func LoadFilter() (libseccomp.ScmpFd, error) { filter, err := libseccomp.NewFilter(libseccomp.ActAllow) if err != nil { return 0, xerrors.Errorf("cannot create filter: %w", err) } err = filter.SetTsync(false) if err != nil { return 0, xerrors.Errorf("cannot set tsync: %w", err) } err = filter.SetNoNewPrivsBit(false) if err != nil { return 0, xerrors.Errorf("cannot set no_new_privs: %w", err) } // we explicitly prohibit open_tree/move_mount to prevent container workloads // from moving a proc mask using open_tree(..., CLONE|RECURSIVE). deniedSyscalls := []string{ "open_tree", "move_mount", } for _, sc := range deniedSyscalls { syscallID, err := libseccomp.GetSyscallFromName(sc) if err != nil { return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err) } err = filter.AddRule(syscallID, libseccomp.ActErrno.SetReturnCode(int16(unix.EPERM))) if err != nil { return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err) } } handledSyscalls := mapHandler(&InWorkspaceHandler{}) for sc := range handledSyscalls { syscallID, err := libseccomp.GetSyscallFromName(sc) if err != nil { return 0, xerrors.Errorf("unknown syscall %s: %w", sc, err) } err = filter.AddRule(syscallID, libseccomp.ActNotify) if err != nil { return 0, xerrors.Errorf("cannot add rule for %s: %w", sc, err) } } err = filter.Load() if err != nil { return 0, xerrors.Errorf("cannot load filter: %w", err) } fd, err := filter.GetNotifFd() if err != nil { return 0, xerrors.Errorf("cannot get inotif fd: %w", err) } return fd, nil } const ( // IWS backoff is the backoff configuration we use for interacting with the in-workspace service iwsBackoffInitialWait = 10 * time.Millisecond iwsBackoffSteps = 6 iwsBackoffFactor = 5 iwsBackoffMaxWait = 2500 * time.Millisecond ) // Handle actually listens on the seccomp notif FD and handles incoming requests. // This function returns when the notif FD is closed. func Handle(fd libseccomp.ScmpFd, handler SyscallHandler, wsid string) (stop chan<- struct{}, errchan <-chan error) { log := log.WithField("workspaceId", wsid) ec := make(chan error) stp := make(chan struct{}) handledSyscalls := mapHandler(handler) go func() { for { req, err := libseccomp.NotifReceive(fd) if err != nil { if err == syscall.ENOENT { log.WithError(err).Warn("failed to get notification beucase it has already been not valid anymore(the kernel sets that)") continue } log.WithError(err).Error("failed to get notification") ec <- err if err == unix.ECANCELED { return } continue } select { case <-stp: // if we're asked stop we might still have to answer a syscall. // We do this on a best effort basis answering with EPERM. if err != nil { _ = libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{ ID: req.ID, Error: 1, Val: 0, Flags: 0, }) } default: } go func() { syscallName, _ := req.Data.Syscall.GetName() handler, ok := handledSyscalls[syscallName] if !ok { handler = handleUnknownSyscall } val, errno, flags := handler(req) ierr := libseccomp.NotifRespond(fd, &libseccomp.ScmpNotifResp{ ID: req.ID, Error: errno, Val: val, Flags: flags, }) if ierr != nil { log.WithError(ierr).Error("failed to return notification response") ec <- ierr } }() } }() return stp, ec } func handleUnknownSyscall(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) { nme, _ := req.Data.Syscall.GetName() log.WithField("syscall", nme).Warn("don't know how to handle this syscall") return 0, 1, 0 } func Errno(err unix.Errno) (val uint64, errno int32, flags uint32) { return ^uint64(0), int32(errno), 0 } // IWSClientProvider provides a client to the in-workspace-service. // Consumers of this provider will close the client after use. type IWSClientProvider func(ctx context.Context) (InWorkspaceServiceClient, error) type InWorkspaceServiceClient interface { daemonapi.InWorkspaceServiceClient io.Closer } // InWorkspaceHandler is the seccomp notification handler that serves a Gitpod workspace type InWorkspaceHandler struct { FD libseccomp.ScmpFd Daemon IWSClientProvider Ring2PID int Ring2Rootfs string BindEvents chan<- BindEvent WorkspaceId string } // BindEvent describes a process binding to a socket type BindEvent struct { PID uint32 } // Mount handles mount syscalls func (h *InWorkspaceHandler) Mount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) { log := log.WithFields(map[string]interface{}{ "syscall": "mount", log.WorkspaceIDField: h.WorkspaceId, "pid": req.Pid, "id": req.ID, }) memFile, err := readarg.OpenMem(req.Pid) if err != nil { log.WithError(err).Error("cannot open mem") return Errno(unix.EPERM) } defer memFile.Close() err = libseccomp.NotifIDValid(h.FD, req.ID) if err != nil { log.WithError(err).Error("invalid notify ID", req.ID) return Errno(unix.EPERM) } source, err := readarg.ReadString(memFile, int64(req.Data.Args[0])) if err != nil { log.WithField("arg", 0).WithError(err).Error("cannot read argument") return Errno(unix.EFAULT) } dest, err := readarg.ReadString(memFile, int64(req.Data.Args[1])) if err != nil { log.WithField("arg", 1).WithError(err).Error("cannot read argument") return Errno(unix.EFAULT) } filesystem, err := readarg.ReadString(memFile, int64(req.Data.Args[2])) if err != nil { log.WithField("arg", 2).WithError(err).Error("cannot read argument") return Errno(unix.EFAULT) } var args string if len(req.Data.Args) >= 5 && filesystem == "nfs4" { args, err = readarg.ReadString(memFile, int64(req.Data.Args[4])) log.WithField("arg", 4).WithError(err).Error("cannot read argument") } log.WithFields(map[string]interface{}{ "source": source, "dest": dest, "fstype": filesystem, "args": args, }).Info("handling mount syscall") if filesystem == "proc" || filesystem == "sysfs" || filesystem == "nfs4" { // When a process wants to mount proc relative to `/proc/self` that path has no meaning outside of the processes' context. // runc started doing this in https://github.com/opencontainers/runc/commit/0ca91f44f1664da834bc61115a849b56d22f595f // TODO(cw): there must be a better way to handle this. Find one. target := filepath.Join(h.Ring2Rootfs, dest) if strings.HasPrefix(dest, "/proc/self/") { target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/self/")) } if strings.HasPrefix(dest, "/proc/thread-self/") { target = filepath.Join("/proc", strconv.Itoa(int(req.Pid)), strings.TrimPrefix(dest, "/proc/thread-self/")) } stat, err := os.Lstat(target) if errors.Is(err, fs.ErrNotExist) { err = os.MkdirAll(target, 0755) } if err != nil { log.WithField("target", target).WithField("dest", dest).WithError(err).Error("cannot stat mountpoint") return Errno(unix.EFAULT) } if stat != nil { if stat.Mode()&os.ModeSymlink != 0 { // The symlink is already expressed relative to the ring2 mount namespace, no need to faff with the rootfs paths. // In case this was a /proc relative symlink, we'll have that symlink resolved here, hence make it work in the mount namespace of ring2. dest, err = os.Readlink(target) if err != nil { log.WithField("target", target).WithField("dest", dest).WithError(err).Errorf("cannot resolve %s mount target symlink", filesystem) return Errno(unix.EFAULT) } } else if stat.Mode()&os.ModeDir == 0 { log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("%s must be mounted on an ordinary directory", filesystem) return Errno(unix.EPERM) } } wait := iwsBackoffInitialWait for i := 0; i < iwsBackoffSteps; i++ { err = func() error { ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) defer cancel() iws, err := h.Daemon(ctx) if err != nil { log.WithField("target", target).WithField("dest", dest).WithField("mode", stat.Mode()).WithError(err).Errorf("cannot get IWS client to mount %s", filesystem) return err } defer iws.Close() call := iws.MountProc if filesystem == "sysfs" { call = iws.MountSysfs } if filesystem == "sysfs" || filesystem == "proc" { _, err = call(ctx, &daemonapi.MountProcRequest{ Target: dest, Pid: int64(req.Pid), }) } else if filesystem == "nfs4" { _, err = iws.MountNfs(ctx, &daemonapi.MountNfsRequest{ Source: source, Target: dest, Args: args, Pid: int64(req.Pid), }) } if err != nil { log.WithField("target", dest).WithError(err).Errorf("cannot mount %s", filesystem) return err } return nil }() if err != nil { time.Sleep(wait) wait = wait * iwsBackoffFactor if wait > iwsBackoffMaxWait { wait = iwsBackoffMaxWait } } else { break } } if err != nil { // We've already logged the reason above return Errno(unix.EFAULT) } return 0, 0, 0 } // let the kernel do the work return 0, 0, libseccomp.NotifRespFlagContinue } // Umount handles umount and umount2 syscalls func (h *InWorkspaceHandler) Umount(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) { nme, _ := req.Data.Syscall.GetName() log := log.WithFields(map[string]interface{}{ "syscall": nme, log.WorkspaceIDField: h.WorkspaceId, "pid": req.Pid, "id": req.ID, }) memFile, err := readarg.OpenMem(req.Pid) if err != nil { log.WithError(err).Error("cannot open mem") return Errno(unix.EPERM) } defer memFile.Close() target, err := readarg.ReadString(memFile, int64(req.Data.Args[0])) if err != nil { log.WithField("arg", 0).WithError(err).Error("cannot read argument") return Errno(unix.EFAULT) } target = strings.TrimSuffix(target, "/") fd, err := os.Open(fmt.Sprintf("/proc/%d/mountinfo", req.Pid)) if err != nil { log.WithError(err).Error("cannot read mountinfo") return Errno(unix.EFAULT) } defer fd.Close() mnts, err := mountinfo.GetMountsFromReader(fd, func(i *mountinfo.Info) (skip bool, stop bool) { return false, false }) if err != nil { log.WithError(err).Error("cannot parse mountinfo") return Errno(unix.EFAULT) } procMounts := make(map[string]struct{}) for _, mnt := range mnts { if mnt.FSType == "proc" { procMounts[mnt.Mountpoint] = struct{}{} } } if _, ok := procMounts[target]; ok { // ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) // defer cancel() // _, err = h.Daemon.UmountProc(ctx, &daemonapi.UmountProcRequest{ // Target: target, // Pid: int64(req.Pid), // }) // if err != nil { // log.WithError(err).Error("cannot umount proc mount") // return Errno(unix.EFAULT) // } // log.WithField("target", target).Info("umounted proc mount") // return 0, 0, 0 // proc umounting doesn't work yet from ws-daemon. Instead EPERM here. // In most cases that's not a problem because in-workspace proc mounts // usually happen within a mount namespace anyways, for which the kernel // lazy umounts everything that's just attached within that namespace. // TODO(cw): make proc umounting work in ws-dameon. return Errno(unix.EPERM) } var isProcMountChild bool for procMount := range procMounts { if strings.HasPrefix(target, procMount) { isProcMountChild = true break } } if isProcMountChild { log.WithField("target", target).Warn("user attempted to umount proc mask") return Errno(unix.EPERM) } // let the kernel do the work return 0, 0, libseccomp.NotifRespFlagContinue } func (h *InWorkspaceHandler) Bind(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) { log := log.WithFields(map[string]interface{}{ "syscall": "bind", log.WorkspaceIDField: h.WorkspaceId, "pid": req.Pid, "id": req.ID, }) // We want the syscall to succeed, no matter what we do in this handler. // The Kernel will execute the syscall for us. defer func() { val = 0 errno = 0 flags = libseccomp.NotifRespFlagContinue }() memFile, err := readarg.OpenMem(req.Pid) if err != nil { log.WithError(err).Error("cannot open mem") return } defer memFile.Close() // TODO(cw): find why this breaks // err = libseccomp.NotifIDValid(fd, req.ID) // if err != nil { // log.WithError(err).Error("invalid notif ID") // return returnErrno(unix.EPERM) // } evt := BindEvent{PID: req.Pid} select { case h.BindEvents <- evt: default: } // socketFdB, err := readarg.ReadBytes(memFile, int64(req.Data.Args[0]), int(req.Data.Args[1]-req.Data.Args[0])) // if err != nil { // log.WithError(err).Error("cannot read socketfd arg") // } // socketfd := nativeEndian.Uint64(socketFdB) // unix.Getsockname() return } func (h *InWorkspaceHandler) Chown(req *libseccomp.ScmpNotifReq) (val uint64, errno int32, flags uint32) { log := log.WithFields(map[string]interface{}{ "syscall": "chown", log.WorkspaceIDField: h.WorkspaceId, "pid": req.Pid, "id": req.ID, }) memFile, err := readarg.OpenMem(req.Pid) if err != nil { log.WithError(err).Error("cannot open mem") return } defer memFile.Close() pth, err := readarg.ReadString(memFile, int64(req.Data.Args[0])) if err != nil { log.WithError(err).Error("cannot open mem") return } if strings.HasPrefix(pth, "/dev/pts") { return 0, 0, 0 } return 0, 0, libseccomp.NotifRespFlagContinue } /* var nativeEndian binary.ByteOrder func init() { buf := [2]byte{} *(*uint16)(unsafe.Pointer(&buf[0])) = uint16(0xABCD) switch buf { case [2]byte{0xCD, 0xAB}: nativeEndian = binary.LittleEndian case [2]byte{0xAB, 0xCD}: nativeEndian = binary.BigEndian default: panic("Could not determine native endianness.") } } */