2021-04-22 14:20:12 +02:00

719 lines
19 KiB
Go

// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package cmd
import (
"context"
"fmt"
"net"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"strings"
"syscall"
"time"
"github.com/rootless-containers/rootlesskit/pkg/msgutil"
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/spf13/cobra"
"golang.org/x/sys/unix"
"google.golang.org/grpc"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/workspacekit/pkg/lift"
"github.com/gitpod-io/gitpod/workspacekit/pkg/seccomp"
"github.com/gitpod-io/gitpod/ws-daemon/api"
daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api"
)
const (
// ring1ShutdownTimeout is the time ring1 gets between SIGTERM and SIGKILL:
// after forwarding SIGTERM to ring1, ring0 waits this long before killing it.
// We do this to ensure we have enough time left for ring0 to clean up prior
// to receiving SIGKILL from the kubelet.
//
// This time must give ring1 enough time to shut down (see time budgets in supervisor.go),
// and to talk to ws-daemon within the terminationGracePeriod of the workspace pod.
ring1ShutdownTimeout = 20 * time.Second
// ring2StartupTimeout is the maximum time ring1 waits between starting ring2 and
// ring2's attempt to connect to the parent socket.
ring2StartupTimeout = 5 * time.Second
)
// ring0Cmd is the entry point of workspacekit. It asks ws-daemon to prepare the
// workspace for user namespaces, starts ring1 (this very binary) in a new user and
// mount namespace, forwards incoming signals to ring1, and triggers the ws-daemon
// teardown once ring1 has exited.
var ring0Cmd = &cobra.Command{
	Use:   "ring0",
	Short: "starts ring0 - enter here",
	Run: func(_ *cobra.Command, args []string) {
		log.Init(ServiceName, Version, true, true)
		log := log.WithField("ring", 0)

		// When a step below marks the run as failed we keep the process around for a
		// while so that the failure can be inspected. This defer is registered first
		// and therefore runs last, i.e. after the teardown below.
		var failed bool
		defer func() {
			if !failed {
				return
			}
			sleepForDebugging()
		}()

		ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
		defer cancel()

		client, conn, err := connectToInWorkspaceDaemonService(ctx)
		if err != nil {
			log.WithError(err).Error("cannot connect to daemon")
			return
		}
		defer conn.Close()

		prep, err := client.PrepareForUserNS(ctx, &daemonapi.PrepareForUserNSRequest{})
		if err != nil {
			log.WithError(err).Fatal("cannot prepare for user namespaces")
			return
		}
		defer func() {
			// The startup context may have expired by now, hence teardown gets its own
			// context. The variables are local so that this defer does not clobber the
			// ones used by the rest of the function.
			teardownCtx, teardownCancel := context.WithTimeout(context.Background(), 10*time.Second)
			defer teardownCancel()

			if _, err := client.Teardown(teardownCtx, &daemonapi.TeardownRequest{}); err != nil {
				log.WithError(err).Error("cannot trigger teardown")
				failed = true
				return
			}
		}()

		cmd := exec.Command("/proc/self/exe", "ring1")
		cmd.SysProcAttr = &syscall.SysProcAttr{
			Pdeathsig:  syscall.SIGKILL,
			Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
		}
		cmd.Stdin = os.Stdin
		cmd.Stdout = os.Stdout
		cmd.Stderr = os.Stderr
		// ring1 learns about the filesystem shift method through its environment.
		cmd.Env = append(os.Environ(), "WORKSPACEKIT_FSSHIFT="+prep.FsShift.String())

		if err := cmd.Start(); err != nil {
			log.WithError(err).Error("failed to start ring1")
			failed = true
			return
		}

		sigc := make(chan os.Signal, 128)
		signal.Notify(sigc)
		go func() {
			defer func() {
				// This is a 'just in case' fallback, in case we're racing the cmd.Process and it's become
				// nil in the time since we checked.
				if r := recover(); r != nil {
					log.WithField("recovered", r).Error("recovered from panic")
				}
			}()

			for {
				sig := <-sigc
				if sig != unix.SIGTERM {
					// Everything but SIGTERM is passed on as-is; delivery is best-effort.
					_ = cmd.Process.Signal(sig)
					continue
				}

				// SIGTERM starts the shutdown: ring1 gets ring1ShutdownTimeout to exit
				// on its own before it is killed.
				_ = cmd.Process.Signal(unix.SIGTERM)
				time.Sleep(ring1ShutdownTimeout)
				if cmd.Process == nil {
					return
				}

				log.Warn("ring1 did not shut down in time - sending sigkill")
				// The error stays local to this goroutine: the enclosing function's err
				// is concurrently written and read around cmd.Wait() below.
				if killErr := cmd.Process.Kill(); killErr != nil {
					log.WithError(killErr).Error("cannot kill ring1")
				}
				return
			}
		}()

		err = cmd.Wait()
		if eerr, ok := err.(*exec.ExitError); ok {
			state, ok := eerr.ProcessState.Sys().(syscall.WaitStatus)
			if ok && state.Signal() == syscall.SIGKILL {
				// Being killed is an expected way for ring1 to end, not a failure.
				log.Warn("ring1 was killed")
				return
			}
		}
		if err != nil {
			log.WithError(err).Error("unexpected exit")
			failed = true
			return
		}
	},
}
// ring1Opts holds the command-line options of the ring1 command.
var ring1Opts struct {
// MappingEstablished is bound to the --mapping-established flag. ring1 re-executes
// itself with this flag after it has had the UID/GID mappings written, so that the
// second invocation skips that step.
MappingEstablished bool
}
// ring1Cmd runs in the user namespace created by ring0. On its first invocation it
// has ws-daemon write the UID/GID mappings and re-executes itself. The second
// invocation assembles the root filesystem for ring2, starts ring2 in new mount and
// PID namespaces, hands it the rootfs location, receives ring2's seccomp fd and
// serves the syscall handler and the command lift until ring2 exits.
var ring1Cmd = &cobra.Command{
	Use:   "ring1",
	Short: "starts ring1",
	Run: func(_cmd *cobra.Command, args []string) {
		log.Init(ServiceName, Version, true, true)
		log := log.WithField("ring", 1)
		defer log.Info("done")

		// When a step below marks the run as failed we keep the process around for a
		// while so that the failure can be inspected.
		var failed bool
		defer func() {
			if !failed {
				return
			}
			sleepForDebugging()
		}()

		ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
		defer cancel()

		client, conn, err := connectToInWorkspaceDaemonService(ctx)
		if err != nil {
			log.WithError(err).Error("cannot connect to daemon")
			failed = true
			return
		}
		defer conn.Close()

		// ID 0 inside the namespace maps to host ID 33333, IDs 1..65534 map to the
		// host range starting at 100000. The same mapping is used for UIDs and GIDs.
		mapping := []*daemonapi.WriteIDMappingRequest_Mapping{
			{ContainerId: 0, HostId: 33333, Size: 1},
			{ContainerId: 1, HostId: 100000, Size: 65534},
		}
		if !ring1Opts.MappingEstablished {
			_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: false, Mapping: mapping})
			if err != nil {
				log.WithError(err).Error("cannot establish UID mapping")
				failed = true
				return
			}
			_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: true, Mapping: mapping})
			if err != nil {
				log.WithError(err).Error("cannot establish GID mapping")
				failed = true
				return
			}

			// Start over with the mapping in place. On success Exec does not return.
			err = syscall.Exec("/proc/self/exe", append(os.Args, "--mapping-established"), os.Environ())
			if err != nil {
				log.WithError(err).Error("cannot exec /proc/self/exe")
				failed = true
				return
			}
			return
		}

		// The parent calls child with Pdeathsig, but it is cleared when the UID/GID mapping is written.
		// (see also https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646).
		//
		// (cw) I have been able to reproduce this issue without newuidmap/newgidmap.
		// See https://gist.github.com/csweichel/3fc9d4b0752367d4a436f969c8685c06
		runtime.LockOSThread()
		unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
		runtime.UnlockOSThread()

		ring2Root, err := os.MkdirTemp("", "supervisor")
		if err != nil {
			log.WithError(err).Fatal("cannot create tempdir")
		}

		var fsshift api.FSShiftMethod
		if v, ok := api.FSShiftMethod_value[os.Getenv("WORKSPACEKIT_FSSHIFT")]; !ok {
			log.WithField("fsshift", os.Getenv("WORKSPACEKIT_FSSHIFT")).Fatal("unknown FS shift method")
		} else {
			fsshift = api.FSShiftMethod(v)
		}

		// mnte describes one mount below ring2Root. An empty Source means "same path
		// as Target", an empty FSType means "none".
		type mnte struct {
			Target string
			Source string
			FSType string
			Flags  uintptr
		}

		var mnts []mnte
		switch fsshift {
		case api.FSShiftMethod_FUSE:
			mnts = append(mnts,
				mnte{Target: "/", Source: "/.workspace/mark", Flags: unix.MS_BIND | unix.MS_REC},
			)
		case api.FSShiftMethod_SHIFTFS:
			mnts = append(mnts,
				mnte{Target: "/", Source: "/.workspace/mark", FSType: "shiftfs"},
			)
		default:
			log.WithField("fsshift", fsshift).Fatal("unknown FS shift method")
		}
		mnts = append(mnts,
			mnte{Target: "/sys", Flags: unix.MS_BIND | unix.MS_REC},
			mnte{Target: "/dev", Flags: unix.MS_BIND | unix.MS_REC},
			// TODO(cw): only mount /workspace if it's in the mount table, i.e. this isn't an FWB workspace
			mnte{Target: "/workspace", Flags: unix.MS_BIND | unix.MS_REC},
			mnte{Target: "/etc/hosts", Flags: unix.MS_BIND | unix.MS_REC},
			mnte{Target: "/etc/hostname", Flags: unix.MS_BIND | unix.MS_REC},
			mnte{Target: "/etc/resolv.conf", Flags: unix.MS_BIND | unix.MS_REC},
			mnte{Target: "/tmp", Source: "tmpfs", FSType: "tmpfs"},
		)

		for _, m := range mnts {
			dst := filepath.Join(ring2Root, m.Target)
			// Best effort: the error is ignored on purpose, a missing or unsuitable
			// mount point surfaces as a mount error below. Directories need the
			// search bit, hence 0755 rather than a file-style mode.
			_ = os.MkdirAll(dst, 0755)

			if m.Source == "" {
				m.Source = m.Target
			}
			if m.FSType == "" {
				m.FSType = "none"
			}

			log.WithFields(map[string]interface{}{
				"source": m.Source,
				"target": dst,
				"fstype": m.FSType,
				"flags":  m.Flags,
			}).Debug("mounting new rootfs")
			err = unix.Mount(m.Source, dst, m.FSType, m.Flags, "")
			if err != nil {
				log.WithError(err).WithField("dest", dst).Error("cannot establish mount")
				failed = true
				return
			}
		}

		// ring2 must not see workspacekit's own configuration.
		env := make([]string, 0, len(os.Environ()))
		for _, e := range os.Environ() {
			if strings.HasPrefix(e, "WORKSPACEKIT_") {
				continue
			}
			env = append(env, e)
		}

		socketFN := filepath.Join(os.TempDir(), fmt.Sprintf("workspacekit-ring1-%d.unix", time.Now().UnixNano()))
		skt, err := net.Listen("unix", socketFN)
		if err != nil {
			log.WithError(err).Error("cannot create socket for ring2")
			failed = true
			return
		}
		defer skt.Close()

		cmd := exec.Command("/proc/self/exe", "ring2", socketFN)
		cmd.SysProcAttr = &syscall.SysProcAttr{
			Pdeathsig:  syscall.SIGKILL,
			Cloneflags: syscall.CLONE_NEWNS | syscall.CLONE_NEWPID,
		}
		cmd.Dir = ring2Root
		cmd.Stdin = os.Stdin
		cmd.Stdout = os.Stdout
		cmd.Stderr = os.Stderr
		cmd.Env = env
		if err := cmd.Start(); err != nil {
			log.WithError(err).Error("failed to start the child process")
			failed = true
			return
		}
		sigc := sigproxy.ForwardAllSignals(context.Background(), cmd.Process.Pid)
		defer sigproxysignal.StopCatch(sigc)

		procLoc := filepath.Join(ring2Root, "proc")
		err = os.MkdirAll(procLoc, 0755)
		if err != nil {
			log.WithError(err).Error("cannot create proc mount point")
			failed = true
			return
		}

		_, err = client.MountProc(ctx, &daemonapi.MountProcRequest{
			Target: procLoc,
			Pid:    int64(cmd.Process.Pid),
		})
		if err != nil {
			log.WithError(err).Error("cannot mount proc")
			failed = true
			return
		}

		// We have to wait for ring2 to come back to us and connect to the socket we've passed along.
		// There's a chance that ring2 crashes or misbehaves, so we don't want to wait forever, hence
		// the somewhat complicated "accept" logic below.
		// If there's a deadline that can be set somewhere that we've missed, we should be using that
		// one instead.
		incoming := make(chan net.Conn, 1)
		errc := make(chan error, 1)
		go func() {
			defer close(incoming)
			defer close(errc)

			// Accept stops the latest when we close the socket.
			c, err := skt.Accept()
			if err != nil {
				errc <- err
				return
			}
			incoming <- c
		}()

		// A single timer spans the whole wait. The loop may go around several times
		// (receiving from an already closed channel yields a zero value) and those
		// extra rounds must not restart the timeout.
		startupTimeout := time.NewTimer(ring2StartupTimeout)
		defer startupTimeout.Stop()

		var ring2Conn *net.UnixConn
		for {
			var brek bool
			select {
			case err = <-errc:
				// A nil error only means errc was closed; keep waiting for the connection.
				if err != nil {
					brek = true
				}
			case c := <-incoming:
				// A nil connection only means incoming was closed; the error is in errc.
				if c == nil {
					continue
				}
				ring2Conn = c.(*net.UnixConn)
				brek = true
			case <-startupTimeout.C:
				err = fmt.Errorf("ring2 did not connect in time")
				brek = true
			}
			if brek {
				break
			}
		}
		if err != nil {
			log.WithError(err).Error("ring2 did not connect successfully")
			failed = true
			return
		}

		log.Info("signaling to child process")
		_, err = msgutil.MarshalToWriter(ring2Conn, ringSyncMsg{
			Stage:   1,
			Rootfs:  ring2Root,
			FSShift: fsshift,
		})
		if err != nil {
			log.WithError(err).Error("cannot send ring sync msg to ring2")
			failed = true
			return
		}

		log.Info("awaiting seccomp fd")
		scmpfd, err := receiveSeccmpFd(ring2Conn)
		if err != nil {
			log.WithError(err).Error("did not receive seccomp fd from ring2")
			failed = true
			return
		}

		if scmpfd == 0 {
			log.Warn("received 0 as ring2 seccomp fd - syscall handling is broken")
		} else {
			handler := &seccomp.InWorkspaceHandler{
				FD:          scmpfd,
				Daemon:      client,
				Ring2PID:    cmd.Process.Pid,
				Ring2Rootfs: ring2Root,
				BindEvents:  make(chan seccomp.BindEvent),
			}

			stp, errchan := seccomp.Handle(scmpfd, handler)
			defer close(stp)
			go func() {
				t := time.NewTicker(10 * time.Millisecond)
				defer t.Stop()
				for {
					// We use the ticker to rate-limit the errors from the syscall handler.
					// We're only handling low-frequency syscalls (e.g. mount), and don't want
					// the handler to hog the CPU because it fails on its fd.
					<-t.C
					err := <-errchan
					if err == nil {
						return
					}
					log.WithError(err).Warn("syscall handler error")
				}
			}()
		}

		go func() {
			err := lift.ServeLift(lift.DefaultSocketPath)
			if err != nil {
				log.WithError(err).Error("failed to serve ring1 command lift")
			}
		}()

		err = cmd.Wait()
		if err != nil {
			log.WithError(err).Error("unexpected exit")
			failed = true
			return
		}
	},
}
// receiveSeccmpFd waits for the peer on conn to pass a file descriptor as socket
// control message (SCM_RIGHTS) and returns it as seccomp fd.
//
// The wait is bounded by a five second deadline on conn. It returns an error if
// the read fails or times out, if the received data is not exactly one control
// message, or if that message carries no file descriptor.
func receiveSeccmpFd(conn *net.UnixConn) (libseccomp.ScmpFd, error) {
	// Room for exactly one control message carrying a single 4-byte fd.
	buf := make([]byte, unix.CmsgSpace(4))

	err := conn.SetDeadline(time.Now().Add(5 * time.Second))
	if err != nil {
		return 0, err
	}

	// Read through conn itself. Deadlines are only enforced for I/O issued via the
	// connection; a raw recvmsg on a duplicated descriptor would ignore the deadline
	// set above and could block forever if the peer never sends anything.
	_, oobn, _, _, err := conn.ReadMsgUnix(nil, buf)
	if err != nil {
		return 0, err
	}

	// Only the bytes actually received are valid control message data.
	msgs, err := unix.ParseSocketControlMessage(buf[:oobn])
	if err != nil {
		return 0, err
	}
	if len(msgs) != 1 {
		return 0, fmt.Errorf("expected a single socket control message")
	}

	fds, err := unix.ParseUnixRights(&msgs[0])
	if err != nil {
		return 0, err
	}
	if len(fds) == 0 {
		return 0, fmt.Errorf("expected a single socket FD")
	}

	return libseccomp.ScmpFd(fds[0]), nil
}
// ring2Opts holds the command-line options of the ring2 command.
var ring2Opts struct {
// SupervisorPath is bound to the --supervisor-path flag and names the binary that
// ring2 replaces itself with once its setup is complete.
SupervisorPath string
}
// ring2Cmd is started by ring1 with the path of ring1's Unix socket as its only
// argument. It connects to that socket, waits for the stage 1 sync message, moves
// into the root filesystem named in that message, loads the seccomp filter, passes
// the resulting fd back to ring1 and finally replaces itself with supervisor.
var ring2Cmd = &cobra.Command{
	Use:   "ring2 <ring1Socket>",
	Short: "starts ring2",
	Args:  cobra.ExactArgs(1),
	Run: func(_cmd *cobra.Command, args []string) {
		log.Init(ServiceName, Version, true, true)
		log := log.WithField("ring", 2)
		defer log.Info("done")

		// A failed run keeps the process around for debugging before it exits.
		var failed bool
		defer func() {
			if failed {
				sleepForDebugging()
			}
		}()
		// abort logs cause with the given message and marks the run as failed; the
		// caller returns right afterwards.
		abort := func(cause error, what string) {
			log.WithError(cause).Error(what)
			failed = true
		}

		// ring1 is reached through a Unix socket because that lets us pass the
		// seccomp fd across later on.
		parent, err := net.Dial("unix", args[0])
		if err != nil {
			abort(err, "cannot connect to parent")
			return
		}
		ring1Conn := parent.(*net.UnixConn)
		log.Info("connected to parent socket")

		// Nothing happens until the parent tells us that /proc has been made available.
		var syncMsg ringSyncMsg
		if _, err = msgutil.UnmarshalFromReader(ring1Conn, &syncMsg); err != nil {
			abort(err, "cannot read parent message")
			return
		}
		if syncMsg.Stage != 1 {
			log.WithError(err).WithField("msg", fmt.Sprintf("%+q", syncMsg)).Error("expected stage 1 sync message")
			failed = true
			return
		}

		if err = pivotRoot(syncMsg.Rootfs, syncMsg.FSShift); err != nil {
			abort(err, "cannot pivot root")
			return
		}

		// With the new root filesystem (proc included) in place, the seccomp filter
		// can be loaded and its fd reported to the parent.
		filterFd, err := seccomp.LoadFilter()
		if err != nil {
			abort(err, "cannot load seccomp filter - syscall handling would be broken")
			return
		}

		sockFile, err := ring1Conn.File()
		if err != nil {
			abort(err, "cannot get parent socket fd")
			return
		}
		defer sockFile.Close()

		rights := unix.UnixRights(int(filterFd))
		err = unix.Sendmsg(int(sockFile.Fd()), nil, rights, nil, 0)
		sockFile.Close()
		if err != nil {
			abort(err, "cannot send seccomp fd")
			return
		}

		// Hand over to supervisor. Exec only returns if it failed.
		supervisorArgs := []string{"supervisor", "run", "--inns"}
		err = unix.Exec(ring2Opts.SupervisorPath, supervisorArgs, os.Environ())
		if err != nil {
			log.WithError(err).WithField("cmd", ring2Opts.SupervisorPath).Error("cannot exec")
			failed = true
			return
		}
	},
}
// pivotRoot makes rootfs the new root filesystem of the calling process.
//
// For the FUSE shift method this is a plain chroot into rootfs followed by a chdir
// to "/". For every other method it calls pivot_root such that rootfs becomes the
// new root filesystem, and the old root is detached afterwards.
//
// Every error is wrapped with the step that failed.
//
// copied from runc: https://github.com/opencontainers/runc/blob/cf6c074115d00c932ef01dedb3e13ba8b8f964c3/libcontainer/rootfs_linux.go#L760
func pivotRoot(rootfs string, fsshift api.FSShiftMethod) error {
	if fsshift == api.FSShiftMethod_FUSE {
		if err := unix.Chroot(rootfs); err != nil {
			return fmt.Errorf("cannot chroot: %w", err)
		}
		if err := unix.Chdir("/"); err != nil {
			return fmt.Errorf("cannot chdir to new root: %w", err)
		}
		return nil
	}

	// While the documentation may claim otherwise, pivot_root(".", ".") is
	// actually valid. What this results in is / being the new root but
	// /proc/self/cwd being the old root. Since we can play around with the cwd
	// with pivot_root this allows us to pivot without creating directories in
	// the rootfs. Shout-outs to the LXC developers for giving us this idea.

	oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return fmt.Errorf("cannot open old root: %w", err)
	}
	defer unix.Close(oldroot)

	newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
	if err != nil {
		return fmt.Errorf("cannot open new root %s: %w", rootfs, err)
	}
	defer unix.Close(newroot)

	// Change to the new root so that the pivot_root actually acts on it.
	if err := unix.Fchdir(newroot); err != nil {
		return fmt.Errorf("cannot fchdir to new root: %w", err)
	}

	if err := unix.PivotRoot(".", "."); err != nil {
		return fmt.Errorf("pivot_root: %w", err)
	}

	// Currently our "." is oldroot (according to the current kernel code).
	// However, purely for safety, we will fchdir(oldroot) since there isn't
	// really any guarantee from the kernel what /proc/self/cwd will be after a
	// pivot_root(2).
	if err := unix.Fchdir(oldroot); err != nil {
		return fmt.Errorf("cannot fchdir to old root: %w", err)
	}

	// Make oldroot rslave to make sure our unmounts don't propagate to the
	// host (and thus bork the machine). We don't use rprivate because this is
	// known to cause issues due to races where we still have a reference to a
	// mount while a process in the host namespace is trying to operate on
	// something it thinks has no mounts (devicemapper in particular).
	if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
		return fmt.Errorf("cannot make old root rslave: %w", err)
	}

	// Perform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
	if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
		return fmt.Errorf("cannot unmount old root: %w", err)
	}

	// Switch back to our shiny new root.
	if err := unix.Chdir("/"); err != nil {
		return fmt.Errorf("chdir /: %w", err)
	}
	return nil
}
// sleepForDebugging blocks until five minutes have passed or SIGINT/SIGTERM
// arrives, whichever happens first, and then terminates the process with exit
// code 1. It never returns.
func sleepForDebugging() {
	log.Info("sleeping five minutes to allow debugging")

	interrupted := make(chan os.Signal, 1)
	signal.Notify(interrupted, syscall.SIGINT, syscall.SIGTERM)

	grace := time.NewTimer(5 * time.Minute)
	select {
	case <-grace.C:
	case <-interrupted:
	}

	os.Exit(1)
}
// ringSyncMsg is the message ring1 sends to ring2 over their shared Unix socket
// once ring2 has connected.
type ringSyncMsg struct {
// Stage identifies the synchronisation step; ring1 sends 1 and ring2 rejects
// any other value.
Stage int `json:"stage"`
// Rootfs is the directory ring2 turns into its root filesystem.
Rootfs string `json:"rootfs"`
// FSShift is the filesystem shift method in use; ring2 passes it on to pivotRoot.
FSShift api.FSShiftMethod `json:"fsshift"`
}
// connectToInWorkspaceDaemonService attempts to connect to the InWorkspaceService offered by the ws-daemon.
//
// It polls every 500ms for the daemon socket to show up in the filesystem and gives
// up with an error once ctx is done. As soon as the socket exists it dials it and
// returns the service client together with the underlying connection, which the
// caller is expected to close.
func connectToInWorkspaceDaemonService(ctx context.Context) (daemonapi.InWorkspaceServiceClient, *grpc.ClientConn, error) {
	const socketFN = "/.workspace/daemon.sock"

	poll := time.NewTicker(500 * time.Millisecond)
	defer poll.Stop()

	socketExists := func() bool {
		_, statErr := os.Stat(socketFN)
		return statErr == nil
	}
	for !socketExists() {
		select {
		case <-ctx.Done():
			return nil, nil, fmt.Errorf("socket did not appear before context was canceled")
		case <-poll.C:
			// look again
		}
	}

	daemonConn, dialErr := grpc.DialContext(ctx, "unix://"+socketFN, grpc.WithInsecure())
	if dialErr != nil {
		return nil, nil, dialErr
	}
	svc := daemonapi.NewInWorkspaceServiceClient(daemonConn)
	return svc, daemonConn, nil
}
// init registers the ring commands with the root command and declares their flags.
//
// The default of ring2's --supervisor-path flag is taken from
// $GITPOD_WORKSPACEKIT_SUPERVISOR_PATH. If that variable is empty, the default is
// the file "supervisor" in the directory of the running executable, or
// /.supervisor/supervisor when the executable's path cannot be determined.
func init() {
	rootCmd.AddCommand(ring0Cmd)
	rootCmd.AddCommand(ring1Cmd)
	rootCmd.AddCommand(ring2Cmd)

	supervisorPath := os.Getenv("GITPOD_WORKSPACEKIT_SUPERVISOR_PATH")
	if supervisorPath == "" {
		supervisorPath = "/.supervisor/supervisor"
		if exe, err := os.Executable(); err == nil {
			supervisorPath = filepath.Join(filepath.Dir(exe), "supervisor")
		}
	}

	ring1Cmd.Flags().BoolVar(&ring1Opts.MappingEstablished, "mapping-established", false, "true if the UID/GID mapping has already been established")
	ring2Cmd.Flags().StringVar(&ring2Opts.SupervisorPath, "supervisor-path", supervisorPath, "path to the supervisor binary (taken from $GITPOD_WORKSPACEKIT_SUPERVISOR_PATH, defaults to 'supervisor' next to the workspacekit executable)")
}