// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.

package iws

import (
	"context"
	"errors"
	"fmt"
	"io"
	"io/fs"
	"math"
	"net"
	"os"
	"os/exec"
	"path/filepath"
	"strconv"
	"strings"
	"sync"
	"syscall"
	"time"

	"github.com/opentracing/opentracing-go"
	"golang.org/x/sys/unix"
	"golang.org/x/time/rate"
	"golang.org/x/xerrors"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/status"

	linuxproc "github.com/c9s/goprocinfo/linux"
	"github.com/gitpod-io/gitpod/common-go/cgroups"
	v1 "github.com/gitpod-io/gitpod/common-go/cgroups/v1"
	v2 "github.com/gitpod-io/gitpod/common-go/cgroups/v2"
	"github.com/gitpod-io/gitpod/common-go/log"
	"github.com/gitpod-io/gitpod/common-go/tracing"
	wsinit "github.com/gitpod-io/gitpod/content-service/pkg/initializer"
	"github.com/gitpod-io/gitpod/ws-daemon/api"
	"github.com/gitpod-io/gitpod/ws-daemon/pkg/container"
	"github.com/gitpod-io/gitpod/ws-daemon/pkg/internal/session"
	nsi "github.com/gitpod-io/gitpod/ws-daemon/pkg/nsinsider"
)

//
// BEWARE
// The code in this file, i.e. everything offered by InWorkspaceServiceServer, is accessible without further protection
// by user-reachable code. There's no server or ws-manager in front of this interface. Keep this interface minimal, and
// be defensive!
//
// IWS services are made available to workspaces through the workspace dispatch.
// When a new workspace is added, the dispatch listener creates a new gRPC server socket,
// and tears it down when the workspace is removed (i.e. the workspace context is canceled).
//

var (
	// These *must* be kept in sync with moby/moby and kubernetes/kubernetes.
	// https://github.com/moby/moby/blob/master/oci/defaults.go#L116-L134
	// https://github.com/kubernetes/kubernetes/blob/master/pkg/securitycontext/util.go#L200-L218
	//
	// Compared to the origin of this list, we imply the /proc prefix.
	// That means we don't list the prefix, but also we only list files/dirs here which
	// reside in /proc (e.g. not /sys/firmware).
	procDefaultMaskedPaths = []string{
		"acpi",
		"kcore",
		"keys",
		"latency_stats",
		"timer_list",
		"timer_stats",
		"sched_debug",
		"scsi",
	}
	procDefaultReadonlyPaths = []string{
		"asound",
		"bus",
		"fs",
		"irq",
		"sys",
		"sysrq-trigger",
	}
	sysfsDefaultMaskedPaths = []string{
		"firmware",
	}
)

// ServeWorkspace establishes the IWS server for a workspace
func ServeWorkspace(uidmapper *Uidmapper, fsshift api.FSShiftMethod, cgroupMountPoint string) func(ctx context.Context, ws *session.Workspace) error {
	return func(ctx context.Context, ws *session.Workspace) (err error) {
		span, _ := opentracing.StartSpanFromContext(ctx, "iws.ServeWorkspace")
		defer tracing.FinishSpan(span, &err)
		if _, running := ws.NonPersistentAttrs[session.AttrWorkspaceServer]; running {
			span.SetTag("alreadyRunning", true)
			return nil
		}

		iws := &InWorkspaceServiceServer{
			Uidmapper:        uidmapper,
			Session:          ws,
			FSShift:          fsshift,
			CGroupMountPoint: cgroupMountPoint,
		}
		err = iws.Start()
		if err != nil {
			return xerrors.Errorf("cannot start in-workspace-helper server: %w", err)
		}

		log.WithFields(ws.OWI()).Debug("established IWS server")
		ws.NonPersistentAttrs[session.AttrWorkspaceServer] = iws.Stop

		return nil
	}
}

// StopServingWorkspace stops a previously started workspace server
func StopServingWorkspace(ctx context.Context, ws *session.Workspace) (err error) {
	//nolint:ineffassign
	span, _ := opentracing.StartSpanFromContext(ctx, "iws.StopServingWorkspace")
	defer tracing.FinishSpan(span, &err)

	rawStop, ok := ws.NonPersistentAttrs[session.AttrWorkspaceServer]
	if !ok {
		return nil
	}

	stopFn, ok := rawStop.(func())
	if !ok {
		return nil
	}

	stopFn()
	log.WithFields(ws.OWI()).Debug("stopped IWS server")
	return nil
}

// InWorkspaceServiceServer implements the workspace-facing backup services
type InWorkspaceServiceServer struct {
	Uidmapper        *Uidmapper
	Session          *session.Workspace
	FSShift          api.FSShiftMethod
	CGroupMountPoint string

	srv  *grpc.Server
	sckt io.Closer

	api.UnimplementedInWorkspaceServiceServer
}

// Start creates the syscall socket the IWS server listens on, and starts the gRPC server on it
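//
// Clients reach this server over the per-workspace unix socket created below. A
// minimal dialing sketch (hypothetical socket path; assumes the generated gRPC
// client and google.golang.org/grpc/credentials/insecure are available):
//
//	conn, err := grpc.Dial("unix:///path/to/ServiceLocDaemon/daemon.sock",
//		grpc.WithTransportCredentials(insecure.NewCredentials()))
//	if err != nil {
//		log.WithError(err).Fatal("cannot dial IWS socket")
//	}
//	defer conn.Close()
//	client := api.NewInWorkspaceServiceClient(conn)
//	info, err := client.WorkspaceInfo(ctx, &api.WorkspaceInfoRequest{})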
func (wbs *InWorkspaceServiceServer) Start() error {
	// It's possible that the kubelet hasn't created the ServiceLocDaemon directory yet.
	err := os.MkdirAll(wbs.Session.ServiceLocDaemon, 0755)
	if err != nil && !os.IsExist(err) {
		return xerrors.Errorf("cannot create ServiceLocDaemon: %w", err)
	}

	socketFN := filepath.Join(wbs.Session.ServiceLocDaemon, "daemon.sock")
	if _, err := os.Stat(socketFN); err == nil {
		// a former ws-daemon instance left its socket lying around.
		// Let's clean up after it.
		_ = os.Remove(socketFN)
	}
	sckt, err := net.Listen("unix", socketFN)
	if err != nil {
		return xerrors.Errorf("cannot create IWS socket: %w", err)
	}

	err = os.Chmod(socketFN, 0777)
	if err != nil {
		return xerrors.Errorf("cannot chmod IWS socket: %w", err)
	}

	wbs.sckt = sckt

	limits := ratelimitingInterceptor{
		"/iws.InWorkspaceService/PrepareForUserNS": ratelimit{
			UseOnce: true,
		},
		"/iws.InWorkspaceService/EvacuateCGroup": ratelimit{
			UseOnce: true,
		},
		"/iws.InWorkspaceService/WriteIDMapping": ratelimit{
			Limiter: rate.NewLimiter(rate.Every(2500*time.Millisecond), 4),
		},
		"/iws.InWorkspaceService/Teardown": ratelimit{
			UseOnce: true,
		},
		"/iws.InWorkspaceService/WorkspaceInfo": ratelimit{
			Limiter: rate.NewLimiter(rate.Every(1500*time.Millisecond), 4),
		},
	}

	wbs.srv = grpc.NewServer(grpc.ChainUnaryInterceptor(limits.UnaryInterceptor()))
	api.RegisterInWorkspaceServiceServer(wbs.srv, wbs)
	go func() {
		err := wbs.srv.Serve(sckt)
		if err != nil {
			log.WithError(err).WithFields(wbs.Session.OWI()).Error("IWS server failed")
		}
	}()
	return nil
}

// Stop stops the service and closes the socket
func (wbs *InWorkspaceServiceServer) Stop() {
	defer wbs.sckt.Close()
	wbs.srv.GracefulStop()
}

// PrepareForUserNS mounts the workspace's shiftfs mark
func (wbs *InWorkspaceServiceServer) PrepareForUserNS(ctx context.Context, req *api.PrepareForUserNSRequest) (*api.PrepareForUserNSResponse, error) {
	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("PrepareForUserNS: cannot find workspace container")
		return nil, status.Errorf(codes.Internal, "cannot find workspace container")
	}

	rootfs, err := rt.ContainerRootfs(ctx, wscontainerID, container.OptsContainerRootfs{Unmapped: true})
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("PrepareForUserNS: cannot find workspace rootfs")
		return nil, status.Errorf(codes.Internal, "cannot find workspace rootfs")
	}

	containerPID, err := rt.ContainerPID(ctx, wscontainerID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("PrepareForUserNS: cannot find workspace container PID")
		return nil, status.Errorf(codes.Internal, "cannot find workspace container PID")
	}

	log.WithField("type", wbs.FSShift).Debug("FSShift")

	// user namespace support for FUSE landed in Linux 4.18:
	// - http://lkml.iu.edu/hypermail/linux/kernel/1806.0/04385.html
	// Development leading up to this point:
	// - https://lists.linuxcontainers.org/pipermail/lxc-devel/2014-July/009797.html
	// - https://lists.linuxcontainers.org/pipermail/lxc-users/2014-October/007948.html
	err = nsi.Nsinsider(wbs.Session.InstanceID, int(containerPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "prepare-dev", "--uid", strconv.Itoa(wsinit.GitpodUID), "--gid", strconv.Itoa(wsinit.GitpodGID))
	})
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("PrepareForUserNS: cannot prepare /dev")
		return nil, status.Errorf(codes.Internal, "cannot prepare /dev")
	}

	// create overlayfs directories to be used in ring2 as rootfs and also upper layer to track changes in the workspace
	_ = os.MkdirAll(filepath.Join(wbs.Session.ServiceLocDaemon, "upper"), 0755)
	_ = os.MkdirAll(filepath.Join(wbs.Session.ServiceLocDaemon, "work"), 0755)
	_ = os.MkdirAll(filepath.Join(wbs.Session.ServiceLocDaemon, "mark"), 0755)

	mountpoint := filepath.Join(wbs.Session.ServiceLocNode, "mark")

	if wbs.FSShift == api.FSShiftMethod_FUSE || wbs.Session.FullWorkspaceBackup {
		err = nsi.Nsinsider(wbs.Session.InstanceID, int(1), func(c *exec.Cmd) {
			// In case of any change in the user mapping, the next line must be updated.
			mappings := fmt.Sprintf("0:%v:1:1:100000:65534", wsinit.GitpodUID)
			c.Args = append(c.Args, "mount-fusefs-mark",
				"--source", rootfs,
				"--merged", filepath.Join(wbs.Session.ServiceLocNode, "mark"),
				"--upper", filepath.Join(wbs.Session.ServiceLocNode, "upper"),
				"--work", filepath.Join(wbs.Session.ServiceLocNode, "work"),
				"--uidmapping", mappings,
				"--gidmapping", mappings)
		})
		if err != nil {
			log.WithField("rootfs", rootfs).WithError(err).Error("cannot mount fusefs mark")
			return nil, status.Errorf(codes.Internal, "cannot mount fusefs mark")
		}

		log.WithFields(wbs.Session.OWI()).WithField("configuredShift", wbs.FSShift).WithField("fwb", wbs.Session.FullWorkspaceBackup).Info("fs-shift using fuse")

		if err := wbs.createWorkspaceCgroup(ctx, wscontainerID); err != nil {
			return nil, err
		}

		return &api.PrepareForUserNSResponse{
			FsShift:               api.FSShiftMethod_FUSE,
			FullWorkspaceBackup:   wbs.Session.FullWorkspaceBackup,
			PersistentVolumeClaim: wbs.Session.PersistentVolumeClaim,
		}, nil
	}

	// We cannot use the nsenter syscall here because mount namespaces affect the whole process, not just the current thread.
	// That's why we resort to exec'ing "nsenter ... mount ...".
	err = nsi.Nsinsider(wbs.Session.InstanceID, int(1), func(c *exec.Cmd) {
		c.Args = append(c.Args, "make-shared", "--target", "/")
	})
	if err != nil {
		log.WithField("containerPID", containerPID).WithError(err).Error("cannot make container's rootfs shared")
		return nil, status.Errorf(codes.Internal, "cannot make container's rootfs shared")
	}

	err = nsi.Nsinsider(wbs.Session.InstanceID, int(1), func(c *exec.Cmd) {
		c.Args = append(c.Args, "mount-shiftfs-mark", "--source", rootfs, "--target", mountpoint)
	})
	if err != nil {
		log.WithField("rootfs", rootfs).WithField("mountpoint", mountpoint).WithError(err).Error("cannot mount shiftfs mark")
		return nil, status.Errorf(codes.Internal, "cannot mount shiftfs mark")
	}

	if err := wbs.createWorkspaceCgroup(ctx, wscontainerID); err != nil {
		return nil, err
	}

	return &api.PrepareForUserNSResponse{
		FsShift:               api.FSShiftMethod_SHIFTFS,
		FullWorkspaceBackup:   wbs.Session.FullWorkspaceBackup,
		PersistentVolumeClaim: wbs.Session.PersistentVolumeClaim,
	}, nil
}

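// createWorkspaceCgroup creates a "workspace" child cgroup underneath the container's
// cgroup and moves the container's processes into it. This only applies to cgroup v2
// (unified) setups; on cgroup v1 it is a no-op.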
func (wbs *InWorkspaceServiceServer) createWorkspaceCgroup(ctx context.Context, wscontainerID container.ID) error {
	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}

	unified, err := cgroups.IsUnifiedCgroupSetup()
	if err != nil {
		// log error and do not expose it to the user
		log.WithError(err).Error("could not determine cgroup setup")
		return status.Errorf(codes.FailedPrecondition, "could not determine cgroup setup")
	}

	if !unified {
		return nil
	}

	cgroupBase, err := rt.ContainerCGroupPath(ctx, wscontainerID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("cannot find workspace container CGroup path")
		return status.Errorf(codes.NotFound, "cannot find workspace container cgroup")
	}

	err = evacuateToCGroup(ctx, wbs.CGroupMountPoint, cgroupBase, "workspace")
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("cannot create workspace cgroup")
		return status.Errorf(codes.FailedPrecondition, "cannot create workspace cgroup")
	}

	return nil
}

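// SetupPairVeths creates a veth pair between the workspace container's network
// namespace and the network namespace of the process identified by req.Pid, sets up
// the peer end, and enables IP forwarding so traffic can flow between the two.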
func (wbs *InWorkspaceServiceServer) SetupPairVeths(ctx context.Context, req *api.SetupPairVethsRequest) (*api.SetupPairVethsResponse, error) {
	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("SetupPairVeths: cannot find workspace container")
		return nil, status.Errorf(codes.Internal, "cannot find workspace container")
	}

	containerPID, err := rt.ContainerPID(ctx, wscontainerID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("SetupPairVeths: cannot find workspace container PID")
		return nil, status.Errorf(codes.Internal, "cannot set up a pair of veths")
	}

	err = nsi.Nsinsider(wbs.Session.InstanceID, int(containerPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "setup-pair-veths", "--target-pid", strconv.Itoa(int(req.Pid)))
	}, nsi.EnterMountNS(true), nsi.EnterPidNS(true), nsi.EnterNetNS(true))
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("SetupPairVeths: cannot set up a pair of veths")
		return nil, status.Errorf(codes.Internal, "cannot set up a pair of veths")
	}

	pid, err := wbs.Uidmapper.findHostPID(containerPID, uint64(req.Pid))
	if err != nil {
		return nil, xerrors.Errorf("cannot map in-container PID %d (container PID: %d): %w", req.Pid, containerPID, err)
	}
	err = nsi.Nsinsider(wbs.Session.InstanceID, int(pid), func(c *exec.Cmd) {
		c.Args = append(c.Args, "setup-peer-veth")
	}, nsi.EnterMountNS(true), nsi.EnterPidNS(true), nsi.EnterNetNS(true))
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("SetupPairVeths: cannot set up the peer veth")
		return nil, status.Errorf(codes.Internal, "cannot set up the peer veth")
	}

	err = nsi.Nsinsider(wbs.Session.InstanceID, int(containerPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "enable-ip-forward")
	}, nsi.EnterNetNS(true), nsi.EnterMountNSPid(1))
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("SetupPairVeths: cannot enable IP forwarding")
		return nil, status.Errorf(codes.Internal, "cannot enable IP forwarding")
	}

	return &api.SetupPairVethsResponse{}, nil
}

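// evacuateToCGroup moves all processes of oldGroup into a newly created child cgroup
// and then enables all available controllers on oldGroup's subtree. cgroup v2 forbids
// processes in non-leaf groups ("no internal processes" rule), so this evacuation is
// required before controllers can be delegated to child groups.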
func evacuateToCGroup(ctx context.Context, mountpoint, oldGroup, child string) error {
	newGroup := filepath.Join(oldGroup, child)
	oldPath := filepath.Join(mountpoint, oldGroup)
	newPath := filepath.Join(mountpoint, newGroup)

	if err := os.MkdirAll(newPath, 0755); err != nil {
		return err
	}

	// evacuate existing procs from oldGroup to newGroup, so that we can enable all controllers including threaded ones
	cgroupProcsBytes, err := os.ReadFile(filepath.Join(oldPath, "cgroup.procs"))
	if err != nil {
		return err
	}
	for _, pidStr := range strings.Split(string(cgroupProcsBytes), "\n") {
		if pidStr == "" || pidStr == "0" {
			continue
		}
		if err := os.WriteFile(filepath.Join(newPath, "cgroup.procs"), []byte(pidStr), 0644); err != nil {
			log.WithError(err).Warnf("failed to move process %s to cgroup %q", pidStr, newGroup)
		}
	}

	// enable controllers for all subgroups under the oldGroup
	controllerBytes, err := os.ReadFile(filepath.Join(oldPath, "cgroup.controllers"))
	if err != nil {
		return err
	}
	for _, controller := range strings.Fields(string(controllerBytes)) {
		log.Debugf("enabling controller %q", controller)
		if err := os.WriteFile(filepath.Join(oldPath, "cgroup.subtree_control"), []byte("+"+controller), 0644); err != nil {
			log.WithError(err).Warnf("failed to enable controller %q", controller)
		}
	}

	return nil
}

// MountProc mounts a proc filesystem
func (wbs *InWorkspaceServiceServer) MountProc(ctx context.Context, req *api.MountProcRequest) (resp *api.MountProcResponse, err error) {
	var (
		reqPID  = req.Pid
		procPID uint64
	)
	defer func() {
		if err == nil {
			return
		}

		log.WithError(err).WithField("procPID", procPID).WithField("reqPID", reqPID).WithFields(wbs.Session.OWI()).Error("cannot mount proc")
		if _, ok := status.FromError(err); !ok {
			err = status.Error(codes.Internal, "cannot mount proc")
		}
	}()

	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find workspace container")
	}

	containerPID, err := rt.ContainerPID(ctx, wscontainerID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find container PID for containerID %v: %w", wscontainerID, err)
	}

	procPID, err = wbs.Uidmapper.findHostPID(containerPID, uint64(req.Pid))
	if err != nil {
		return nil, xerrors.Errorf("cannot map in-container PID %d (container PID: %d): %w", req.Pid, containerPID, err)
	}

	nodeStaging, err := os.MkdirTemp("", "proc-staging")
	if err != nil {
		return nil, xerrors.Errorf("cannot prepare proc staging: %w", err)
	}
	err = nsi.Nsinsider(wbs.Session.InstanceID, int(procPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "mount-proc", "--target", nodeStaging)
	}, nsi.EnterMountNS(false), nsi.EnterPidNS(true), nsi.EnterNetNS(true))
	if err != nil {
		return nil, xerrors.Errorf("mount new proc at %s: %w", nodeStaging, err)
	}

	for _, mask := range procDefaultMaskedPaths {
		err = maskPath(filepath.Join(nodeStaging, mask))
		if err != nil {
			return nil, xerrors.Errorf("cannot mask %s: %w", mask, err)
		}
	}
	for _, rdonly := range procDefaultReadonlyPaths {
		err = readonlyPath(filepath.Join(nodeStaging, rdonly))
		if err != nil {
			return nil, xerrors.Errorf("cannot mount readonly %s: %w", rdonly, err)
		}
	}

	err = moveMount(wbs.Session.InstanceID, int(procPID), nodeStaging, req.Target)
	if err != nil {
		return nil, err
	}

	// now that we've moved the mount (which we've done with OPEN_TREE_CLONE), we'll
	// need to unmount the mask mounts again to not leave them dangling.
	var masks []string
	masks = append(masks, procDefaultMaskedPaths...)
	masks = append(masks, procDefaultReadonlyPaths...)
	cleanupMaskedMount(wbs.Session.OWI(), nodeStaging, masks)

	return &api.MountProcResponse{}, nil
}

// UmountProc unmounts the proc filesystem that was previously mounted via MountProc
func (wbs *InWorkspaceServiceServer) UmountProc(ctx context.Context, req *api.UmountProcRequest) (resp *api.UmountProcResponse, err error) {
	var (
		reqPID  = req.Pid
		procPID uint64
	)
	defer func() {
		if err == nil {
			return
		}

		log.WithError(err).WithField("procPID", procPID).WithField("reqPID", reqPID).Error("UmountProc failed")
		if _, ok := status.FromError(err); !ok {
			err = status.Error(codes.Internal, "cannot umount proc")
		}
	}()

	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find workspace container")
	}

	containerPID, err := rt.ContainerPID(ctx, wscontainerID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find container PID for containerID %v: %w", wscontainerID, err)
	}

	procPID, err = wbs.Uidmapper.findHostPID(containerPID, uint64(req.Pid))
	if err != nil {
		return nil, xerrors.Errorf("cannot map in-container PID %d (container PID: %d): %w", req.Pid, containerPID, err)
	}

	nodeStaging, err := os.MkdirTemp("", "proc-umount")
	if err != nil {
		return nil, xerrors.Errorf("cannot prepare proc staging: %w", err)
	}
	scktPath := filepath.Join(nodeStaging, "sckt")
	l, err := net.Listen("unix", scktPath)
	if err != nil {
		return nil, xerrors.Errorf("cannot listen for mountfd: %w", err)
	}
	defer l.Close()

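	// What follows is a file-descriptor handoff: nsinsider runs open-tree inside the
	// container's mount namespace and sends the resulting mount fd back to us over the
	// unix socket as an SCM_RIGHTS control message. The goroutine below receives that
	// fd; further down we detach the mount in a fresh mount namespace.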
	type fdresp struct {
		FD  int
		Err error
	}
	fdrecv := make(chan fdresp)
	go func() {
		defer close(fdrecv)

		rconn, err := l.Accept()
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}
		defer rconn.Close()

		conn := rconn.(*net.UnixConn)
		err = conn.SetDeadline(time.Now().Add(5 * time.Second))
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}

		f, err := conn.File()
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}
		defer f.Close()
		connfd := int(f.Fd())

		buf := make([]byte, unix.CmsgSpace(4))
		_, _, _, _, err = unix.Recvmsg(connfd, nil, buf, 0)
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}

		msgs, err := unix.ParseSocketControlMessage(buf)
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}
		if len(msgs) != 1 {
			fdrecv <- fdresp{Err: xerrors.Errorf("expected a single socket control message")}
			return
		}

		fds, err := unix.ParseUnixRights(&msgs[0])
		if err != nil {
			fdrecv <- fdresp{Err: err}
			return
		}
		if len(fds) == 0 {
			fdrecv <- fdresp{Err: xerrors.Errorf("expected a single socket FD")}
			return
		}

		fdrecv <- fdresp{FD: fds[0]}
	}()

	rconn, err := net.Dial("unix", scktPath)
	if err != nil {
		return nil, err
	}
	defer rconn.Close()
	conn := rconn.(*net.UnixConn)
	connFD, err := conn.File()
	if err != nil {
		return nil, err
	}

	err = nsi.Nsinsider(wbs.Session.InstanceID, int(procPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "open-tree", "--target", req.Target, "--pipe-fd", "3")
		c.ExtraFiles = append(c.ExtraFiles, connFD)
	})
	if err != nil {
		return nil, xerrors.Errorf("cannot open-tree at %s (container PID: %d): %w", req.Target, containerPID, err)
	}

	fdr := <-fdrecv
	if fdr.Err != nil {
		return nil, fdr.Err
	}
	if fdr.FD == 0 {
		return nil, xerrors.Errorf("received an invalid mountfd (container PID: %d)", containerPID)
	}

	base, err := os.Executable()
	if err != nil {
		return nil, err
	}

	cmd := exec.Command(filepath.Join(filepath.Dir(base), "nsinsider"), "move-mount", "--target", nodeStaging, "--pipe-fd", "3")
	cmd.ExtraFiles = append(cmd.ExtraFiles, os.NewFile(uintptr(fdr.FD), ""))
	cmd.SysProcAttr = &syscall.SysProcAttr{
		Unshareflags: syscall.CLONE_NEWNS,
	}
	out, err := cmd.CombinedOutput()
	if err != nil {
		return nil, xerrors.Errorf("cannot move-mount: %w: %s", err, string(out))
	}

	return &api.UmountProcResponse{}, nil
}

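// MountSysfs mounts a sysfs filesystem at the requested target and applies the
// default masked paths. It reuses the MountProc request/response types from the API.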
func (wbs *InWorkspaceServiceServer) MountSysfs(ctx context.Context, req *api.MountProcRequest) (resp *api.MountProcResponse, err error) {
	var (
		reqPID  = req.Pid
		procPID uint64
	)
	defer func() {
		if err == nil {
			return
		}

		log.WithError(err).WithField("procPID", procPID).WithField("reqPID", reqPID).WithFields(wbs.Session.OWI()).Error("cannot mount sysfs")
		if _, ok := status.FromError(err); !ok {
			err = status.Error(codes.Internal, "cannot mount sysfs")
		}
	}()

	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find workspace container")
	}

	containerPID, err := rt.ContainerPID(ctx, wscontainerID)
	if err != nil {
		return nil, xerrors.Errorf("cannot find container PID for containerID %v: %w", wscontainerID, err)
	}

	procPID, err = wbs.Uidmapper.findHostPID(containerPID, uint64(req.Pid))
	if err != nil {
		return nil, xerrors.Errorf("cannot map in-container PID %d (container PID: %d): %w", req.Pid, containerPID, err)
	}

	nodeStaging, err := os.MkdirTemp("", "sysfs-staging")
	if err != nil {
		return nil, xerrors.Errorf("cannot prepare sysfs staging: %w", err)
	}
	err = nsi.Nsinsider(wbs.Session.InstanceID, int(procPID), func(c *exec.Cmd) {
		c.Args = append(c.Args, "mount-sysfs", "--target", nodeStaging)
	}, nsi.EnterMountNS(false), nsi.EnterNetNS(true))
	if err != nil {
		return nil, xerrors.Errorf("mount new sysfs at %s: %w", nodeStaging, err)
	}

	for _, mask := range sysfsDefaultMaskedPaths {
		err = maskPath(filepath.Join(nodeStaging, mask))
		if err != nil {
			return nil, xerrors.Errorf("cannot mask %s: %w", mask, err)
		}
	}

	err = moveMount(wbs.Session.InstanceID, int(procPID), nodeStaging, req.Target)
	if err != nil {
		return nil, err
	}

	cleanupMaskedMount(wbs.Session.OWI(), nodeStaging, sysfsDefaultMaskedPaths)

	return &api.MountProcResponse{}, nil
}

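// moveMount clones the mount tree at source using open_tree(OPEN_TREE_CLONE) and
// attaches it at target inside the mount namespace of targetPid. The mount fd is
// handed to nsinsider as fd 3 via ExtraFiles.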
func moveMount(instanceID string, targetPid int, source, target string) error {
	mntfd, err := syscallOpenTree(unix.AT_FDCWD, source, flagOpenTreeClone|flagAtRecursive)
	if err != nil {
		return xerrors.Errorf("cannot open tree: %w", err)
	}
	mntf := os.NewFile(mntfd, "")
	defer mntf.Close()

	// Note(cw): we also need to enter the target PID namespace because the mount target
	// might refer to proc.
	err = nsi.Nsinsider(instanceID, targetPid, func(c *exec.Cmd) {
		c.Args = append(c.Args, "move-mount", "--target", target, "--pipe-fd", "3")
		c.ExtraFiles = append(c.ExtraFiles, mntf)
	}, nsi.EnterPidNS(true))
	if err != nil {
		return xerrors.Errorf("cannot move mount: %w", err)
	}
	return nil
}

// cleanupMaskedMount will unmount and remove the paths joined with the basedir.
// Errors are logged instead of returned.
// This is needed because once we've moved the mount (which we've done with
// OPEN_TREE_CLONE), we must unmount the mask mounts again so as not to leave them dangling.
func cleanupMaskedMount(owi map[string]interface{}, base string, paths []string) {
	for _, mask := range paths {
		// Note: if errors happen while unmounting or removing the masks this does not mean
		// that the final unmount won't happen. I.e. we can ignore those errors here
		// because they would not be actionable anyways. Only if the final removal or
		// unmount fails did we leak a mount.

		fn := filepath.Join(base, mask)
		err := unix.Unmount(fn, 0)
		if err != nil {
			continue
		}
		_ = os.RemoveAll(fn)
	}

	err := unix.Unmount(base, 0)
	if err != nil {
		log.WithError(err).WithField("fn", base).WithFields(owi).Warn("cannot unmount dangling base mount")
		err = unix.Unmount(base, syscall.MNT_DETACH)
		if err != nil {
			log.WithError(err).WithField("fn", base).WithFields(owi).Warn("cannot detach dangling base mount")
		}
		return
	}

	err = os.RemoveAll(base)
	if err != nil {
		log.WithError(err).WithField("fn", base).WithFields(owi).Warn("cannot remove dangling base mount")
		return
	}
}

// maskPath masks the top of the specified path inside a container to avoid
// security issues from processes reading information from non-namespace-aware
// mounts (e.g. /proc/kcore).
// For files, maskPath bind mounts /dev/null over the top of the specified path.
// For directories, maskPath mounts read-only tmpfs over the top of the specified path.
//
// Blatant copy from runc: https://github.com/opencontainers/runc/blob/master/libcontainer/rootfs_linux.go#L946-L959
func maskPath(path string) error {
	if err := unix.Mount("/dev/null", path, "", unix.MS_BIND, ""); err != nil && !errors.Is(err, fs.ErrNotExist) {
		if err == unix.ENOTDIR {
			return unix.Mount("tmpfs", path, "tmpfs", unix.MS_RDONLY, "")
		}
		return err
	}
	return nil
}

// readonlyPath will make a path read only.
//
// Blatant copy from runc: https://github.com/opencontainers/runc/blob/master/libcontainer/rootfs_linux.go#L907-L916
func readonlyPath(path string) error {
	if err := unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REC, ""); err != nil {
		if errors.Is(err, fs.ErrNotExist) {
			return nil
		}
		return err
	}
	return unix.Mount(path, path, "", unix.MS_BIND|unix.MS_REMOUNT|unix.MS_RDONLY|unix.MS_REC, "")
}

// WriteIDMapping writes /proc/.../uid_map and /proc/.../gid_map for a workspace container
func (wbs *InWorkspaceServiceServer) WriteIDMapping(ctx context.Context, req *api.WriteIDMappingRequest) (*api.WriteIDMappingResponse, error) {
	cid, err := wbs.Uidmapper.Runtime.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		log.WithFields(wbs.Session.OWI()).WithError(err).Error("cannot write ID mapping, because we cannot find the container")
		return nil, status.Error(codes.FailedPrecondition, "cannot find container")
	}

	err = wbs.Uidmapper.HandleUIDMappingRequest(ctx, req, cid, wbs.Session.InstanceID)
	if err != nil {
		return nil, err
	}

	return &api.WriteIDMappingResponse{}, nil
}

// EvacuateCGroup allows workspace users to manipulate the cgroups their processes
// belong to by constructing a cgroup hierarchy of the following form:
//
//	<container-cgroup>  drwxr-xr-x 3 root      root
//	└── workspace       drwxr-xr-x 5 gitpodUid gitpodGid
//	    └── user        drwxr-xr-x 5 gitpodUid gitpodGid
func (wbs *InWorkspaceServiceServer) EvacuateCGroup(ctx context.Context, req *api.EvacuateCGroupRequest) (*api.EvacuateCGroupResponse, error) {
	unified, err := cgroups.IsUnifiedCgroupSetup()
	if err != nil {
		// log error and do not expose it to the user
		log.WithError(err).Error("could not determine cgroup setup")
		return nil, status.Errorf(codes.FailedPrecondition, "could not determine cgroup setup")
	}
	if !unified {
		return &api.EvacuateCGroupResponse{}, nil
	}

	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("EvacuateCGroup: cannot find workspace container")
		return nil, status.Errorf(codes.NotFound, "cannot find workspace container")
	}

	cgroupBase, err := rt.ContainerCGroupPath(ctx, wscontainerID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("EvacuateCGroup: cannot find workspace container CGroup path")
		return nil, status.Errorf(codes.NotFound, "cannot find workspace container cgroup")
	}

	workspaceCGroup := filepath.Join(cgroupBase, "workspace")
	if _, err := os.Stat(filepath.Join(wbs.CGroupMountPoint, workspaceCGroup)); err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).WithField("path", workspaceCGroup).Error("EvacuateCGroup: workspace cgroup error")
		return nil, status.Errorf(codes.FailedPrecondition, "cannot find workspace cgroup")
	}

	err = evacuateToCGroup(ctx, wbs.CGroupMountPoint, workspaceCGroup, "user")
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).WithField("path", workspaceCGroup).Error("EvacuateCGroup: cannot produce user cgroup")
		return nil, status.Errorf(codes.FailedPrecondition, "cannot produce user cgroup")
	}

	out, err := exec.CommandContext(ctx, "chown", "-R", fmt.Sprintf("%d:%d", wsinit.GitpodUID, wsinit.GitpodGID), filepath.Join(wbs.CGroupMountPoint, workspaceCGroup)).CombinedOutput()
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).WithField("path", workspaceCGroup).WithField("out", string(out)).Error("EvacuateCGroup: cannot chown workspace cgroup")
		return nil, status.Errorf(codes.FailedPrecondition, "cannot chown workspace cgroup")
	}

	return &api.EvacuateCGroupResponse{}, nil
}

// Teardown triggers the final live backup and possibly the shiftfs mark unmount
func (wbs *InWorkspaceServiceServer) Teardown(ctx context.Context, req *api.TeardownRequest) (*api.TeardownResponse, error) {
	owi := wbs.Session.OWI()

	var (
		success = true
		err     error
	)

	err = wbs.unPrepareForUserNS()
	if err != nil {
		log.WithError(err).WithFields(owi).Error("mark FS unmount failed")
		success = false
	}

	return &api.TeardownResponse{Success: success}, nil
}

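// unPrepareForUserNS unmounts the mark mount established by PrepareForUserNS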
func (wbs *InWorkspaceServiceServer) unPrepareForUserNS() error {
	mountpoint := filepath.Join(wbs.Session.ServiceLocNode, "mark")
	err := nsi.Nsinsider(wbs.Session.InstanceID, 1, func(c *exec.Cmd) {
		c.Args = append(c.Args, "unmount", "--target", mountpoint)
	})
	if err != nil {
		return xerrors.Errorf("cannot unmount mark at %s: %w", mountpoint, err)
	}

	return nil
}

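// WorkspaceInfo returns the CPU and memory limits and current usage of the
// workspace, as read from the workspace container's cgroup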
func (wbs *InWorkspaceServiceServer) WorkspaceInfo(ctx context.Context, req *api.WorkspaceInfoRequest) (*api.WorkspaceInfoResponse, error) {
	log.Debug("Received workspace info request")
	rt := wbs.Uidmapper.Runtime
	if rt == nil {
		return nil, status.Errorf(codes.FailedPrecondition, "not connected to container runtime")
	}
	wscontainerID, err := rt.WaitForContainer(ctx, wbs.Session.InstanceID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("WorkspaceInfo: cannot find workspace container")
		return nil, status.Errorf(codes.NotFound, "cannot find workspace container")
	}

	cgroupPath, err := rt.ContainerCGroupPath(ctx, wscontainerID)
	if err != nil {
		log.WithError(err).WithFields(wbs.Session.OWI()).Error("WorkspaceInfo: cannot find workspace container CGroup path")
		return nil, status.Errorf(codes.NotFound, "cannot find workspace container cgroup")
	}

	unified, err := cgroups.IsUnifiedCgroupSetup()
	if err != nil {
		// log error and do not expose it to the user
		log.WithError(err).Error("could not determine cgroup setup")
		return nil, status.Errorf(codes.FailedPrecondition, "could not determine cgroup setup")
	}

	resources, err := getWorkspaceResourceInfo(wbs.CGroupMountPoint, cgroupPath, unified)
	if err != nil {
		if !errors.Is(err, os.ErrNotExist) {
			log.WithError(err).Error("could not get resource information")
		}
		return nil, status.Error(codes.Unknown, err.Error())
	}

	return &api.WorkspaceInfoResponse{
		Resources: resources,
	}, nil
}

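// getWorkspaceResourceInfo reads CPU and memory resource information from the
// workspace's cgroup, using the v2 (unified) or v1 layout as appropriate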
func getWorkspaceResourceInfo(mountPoint, cgroupPath string, unified bool) (*api.Resources, error) {
	if unified {
		cpu, err := getCpuResourceInfoV2(mountPoint, cgroupPath)
		if err != nil {
			return nil, err
		}

		memory, err := getMemoryResourceInfoV2(mountPoint, cgroupPath)
		if err != nil {
			return nil, err
		}

		return &api.Resources{
			Cpu:    cpu,
			Memory: memory,
		}, nil
	} else {
		cpu, err := getCpuResourceInfoV1(mountPoint, cgroupPath)
		if err != nil {
			return nil, err
		}

		memory, err := getMemoryResourceInfoV1(mountPoint, cgroupPath)
		if err != nil {
			return nil, err
		}

		return &api.Resources{
			Cpu:    cpu,
			Memory: memory,
		}, nil
	}
}

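// getCpuResourceInfoV2 samples the cgroup v2 cpu controller twice, one second apart,
// and derives usage in millicores: used = Δusage/Δuptime * 1000. The limit is
// quota/period * 1000 from cpu.max, or cores * 1000 when no quota is set.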
func getCpuResourceInfoV2(mountPoint, cgroupPath string) (*api.Cpu, error) {
	cpu := v2.NewCpuControllerWithMount(mountPoint, cgroupPath)

	t, err := resolveCPUStatV2(cpu)
	if err != nil {
		return nil, err
	}

	time.Sleep(time.Second)

	t2, err := resolveCPUStatV2(cpu)
	if err != nil {
		return nil, err
	}

	cpuUsage := t2.usage - t.usage
	totalTime := t2.uptime - t.uptime
	used := cpuUsage / totalTime * 1000

	quota, period, err := cpu.Max()
	if errors.Is(err, os.ErrNotExist) {
		quota = math.MaxUint64
	} else if err != nil {
		return nil, err
	}

	// if no cpu limit has been specified, use the number of cores
	var limit uint64
	if quota == math.MaxUint64 {
		// TODO(toru): we have to check a parent cgroup instead of the host resources
		cpuInfo, err := linuxproc.ReadCPUInfo("/proc/cpuinfo")
		if err != nil {
			return nil, err
		}

		limit = uint64(cpuInfo.NumCore()) * 1000
	} else {
		limit = quota / period * 1000
	}

	return &api.Cpu{
		Used:  int64(used),
		Limit: int64(limit),
	}, nil
}

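// getMemoryResourceInfoV2 reads the memory limit (capped at the host's total memory)
// and the current usage minus the inactive file cache from the cgroup v2 memory
// controller.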
func getMemoryResourceInfoV2(mountPoint, cgroupPath string) (*api.Memory, error) {
	memory := v2.NewMemoryControllerWithMount(mountPoint, cgroupPath)
	memoryLimit, err := memory.Max()
	if err != nil {
		return nil, xerrors.Errorf("could not retrieve memory max: %w", err)
	}

	memInfo, err := linuxproc.ReadMemInfo("/proc/meminfo")
	if err != nil {
		return nil, xerrors.Errorf("failed to read meminfo: %w", err)
	}

	// if no memory limit has been specified, use total available memory
	if memoryLimit == math.MaxUint64 || memoryLimit > memInfo.MemTotal*1024 {
		// total memory is specified in kilobytes -> convert to bytes
		memoryLimit = memInfo.MemTotal * 1024
	}

	usedMemory, err := memory.Current()
	if err != nil {
		return nil, xerrors.Errorf("failed to read current memory usage: %w", err)
	}

	stats, err := memory.Stat()
	if err != nil {
		return nil, xerrors.Errorf("failed to read memory stats: %w", err)
	}

	if stats.InactiveFileTotal > 0 {
		if usedMemory < stats.InactiveFileTotal {
			usedMemory = 0
		} else {
			usedMemory -= stats.InactiveFileTotal
		}
	}

	return &api.Memory{
		Limit: int64(memoryLimit),
		Used:  int64(usedMemory),
	}, nil
}

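// getMemoryResourceInfoV1 is the cgroup v1 counterpart of getMemoryResourceInfoV2:
// it reads the limit and usage via the v1 memory controller and likewise subtracts
// the inactive file cache from the reported usage.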
func getMemoryResourceInfoV1(mountPoint, cgroupPath string) (*api.Memory, error) {
	memory := v1.NewMemoryControllerWithMount(mountPoint, cgroupPath)

	memoryLimit, err := memory.Limit()
	if err != nil {
		return nil, err
	}

	memInfo, err := linuxproc.ReadMemInfo("/proc/meminfo")
	if err != nil {
		return nil, xerrors.Errorf("failed to read meminfo: %w", err)
	}

	// if no memory limit has been specified, use total available memory
	if memoryLimit == math.MaxUint64 || memoryLimit > memInfo.MemTotal*1024 {
		// total memory is specified in kilobytes -> convert to bytes
		memoryLimit = memInfo.MemTotal * 1024
	}

	usedMemory, err := memory.Usage()
	if err != nil {
		return nil, xerrors.Errorf("failed to read memory usage: %w", err)
	}

	stats, err := memory.Stat()
	if err != nil {
		return nil, xerrors.Errorf("failed to read memory stats: %w", err)
	}

	if stats.InactiveFileTotal > 0 {
		if usedMemory < stats.InactiveFileTotal {
			usedMemory = 0
		} else {
			usedMemory -= stats.InactiveFileTotal
		}
	}

	return &api.Memory{
		Limit: int64(memoryLimit),
		Used:  int64(usedMemory),
	}, nil
}

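// getCpuResourceInfoV1 mirrors getCpuResourceInfoV2 for cgroup v1: it samples CPU
// usage twice, one second apart, and reports used/limit in millicores. Without a
// quota, the core count is inferred from cpuacct.usage_percpu.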
func getCpuResourceInfoV1(mountPoint, cgroupPath string) (*api.Cpu, error) {
	cpu := v1.NewCpuControllerWithMount(mountPoint, cgroupPath)

	t, err := resolveCPUStatV1(cpu)
	if err != nil {
		return nil, err
	}

	time.Sleep(time.Second)

	t2, err := resolveCPUStatV1(cpu)
	if err != nil {
		return nil, err
	}

	cpuUsage := t2.usage - t.usage
	totalTime := t2.uptime - t.uptime
	used := cpuUsage / totalTime * 1000

	quota, err := cpu.Quota()
	if err != nil {
		return nil, err
	}

	// if no cpu limit has been specified, use the number of cores
	var limit uint64
	if quota == math.MaxUint64 {
		content, err := os.ReadFile(filepath.Join(mountPoint, "cpu", cgroupPath, "cpuacct.usage_percpu"))
		if err != nil {
			return nil, xerrors.Errorf("failed to read cpuacct.usage_percpu: %w", err)
		}
		limit = uint64(len(strings.Split(strings.TrimSpace(string(content)), " "))) * 1000
	} else {
		period, err := cpu.Period()
		if err != nil {
			return nil, err
		}

		limit = quota / period * 1000
	}

	return &api.Cpu{
		Used:  int64(used),
		Limit: int64(limit),
	}, nil
}

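// cpuStat pairs cumulative CPU usage with the host uptime at which it was sampled,
// both in seconds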
type cpuStat struct {
	usage  float64
	uptime float64
}

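// resolveCPUStatV1 takes a cpuStat sample from the cgroup v1 cpuacct controller,
// which reports usage in nanoseconds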
func resolveCPUStatV1(cpu *v1.Cpu) (*cpuStat, error) {
	usage_ns, err := cpu.Usage()
	if err != nil {
		return nil, xerrors.Errorf("failed to get cpu usage: %w", err)
	}

	// convert from nanoseconds to seconds
	usage := float64(usage_ns) * 1e-9
	uptime, err := readProcUptime()
	if err != nil {
		return nil, err
	}

	return &cpuStat{
		usage:  usage,
		uptime: uptime,
	}, nil
}

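// resolveCPUStatV2 takes a cpuStat sample from the cgroup v2 cpu controller, whose
// cpu.stat reports usage in microseconds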
func resolveCPUStatV2(cpu *v2.Cpu) (*cpuStat, error) {
	stats, err := cpu.Stat()
	if err != nil {
		return nil, xerrors.Errorf("failed to get cpu usage: %w", err)
	}

	// convert from microseconds to seconds
	usage := float64(stats.UsageTotal) * 1e-6
	uptime, err := readProcUptime()
	if err != nil {
		return nil, err
	}

	return &cpuStat{
		usage:  usage,
		uptime: uptime,
	}, nil
}

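// readProcUptime returns the host's uptime in seconds, as reported by /proc/uptime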
func readProcUptime() (float64, error) {
	content, err := os.ReadFile("/proc/uptime")
	if err != nil {
		return 0, xerrors.Errorf("failed to read uptime: %w", err)
	}
	values := strings.Split(strings.TrimSpace(string(content)), " ")
	uptime, err := strconv.ParseFloat(values[0], 64)
	if err != nil {
		return 0, xerrors.Errorf("failed to parse uptime: %w", err)
	}

	return uptime, nil
}

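// ratelimitingInterceptor maps full gRPC method names to the rate limit that applies
// to them; methods without an entry are not limited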
type ratelimitingInterceptor map[string]ratelimit

type ratelimit struct {
	Limiter *rate.Limiter
	UseOnce bool
}

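// UnaryInterceptor produces a gRPC unary interceptor which enforces the configured
// limits: a UseOnce method fails with ResourceExhausted on any call after the first,
// and a Limiter-backed method fails once its token bucket is exhausted.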
func (rli ratelimitingInterceptor) UnaryInterceptor() grpc.UnaryServerInterceptor {
	var (
		mu   sync.Mutex
		used = make(map[string]struct{})
	)
	return func(ctx context.Context, req interface{}, info *grpc.UnaryServerInfo, handler grpc.UnaryHandler) (resp interface{}, err error) {
		limit, ok := rli[info.FullMethod]
		if ok {
			if limit.UseOnce {
				mu.Lock()
				_, ran := used[info.FullMethod]
				used[info.FullMethod] = struct{}{}
				mu.Unlock()

				if ran {
					return nil, status.Error(codes.ResourceExhausted, "can be used only once")
				}
			}

			if limit.Limiter != nil && !limit.Limiter.Allow() {
				return nil, status.Error(codes.ResourceExhausted, "too many requests")
			}
		}

		return handler(ctx, req)
	}
}