mirror of
https://github.com/gitpod-io/gitpod.git
synced 2025-12-08 17:36:30 +00:00
511 lines
14 KiB
Go
511 lines
14 KiB
Go
// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
|
|
// Licensed under the GNU Affero General Public License (AGPL).
|
|
// See License-AGPL.txt in the project root for license information.
|
|
|
|
package cmd
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
"io/ioutil"
|
|
"os"
|
|
"os/exec"
|
|
"os/signal"
|
|
"path/filepath"
|
|
"runtime"
|
|
"syscall"
|
|
"time"
|
|
|
|
"github.com/gitpod-io/gitpod/common-go/log"
|
|
daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api"
|
|
|
|
"github.com/rootless-containers/rootlesskit/pkg/msgutil"
|
|
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
|
|
sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
|
|
"github.com/spf13/cobra"
|
|
"golang.org/x/sys/unix"
|
|
"google.golang.org/grpc"
|
|
"kernel.org/pub/linux/libs/security/libcap/cap"
|
|
)
|
|
|
|
const (
|
|
// ring1ShutdownTimeout is the time ring1 gets between SIGTERM and SIGKILL.
|
|
// We do this to ensure we have enough time left for ring0 to clean up prior
|
|
// to receiving SIGKILL from the kubelet.
|
|
//
|
|
// This time must give ring1 enough time to shut down (see time budgets in supervisor.go),
|
|
// and to talk to ws-daemon within the terminationGracePeriod of the workspace pod.
|
|
ring1ShutdownTimeout = 20 * time.Second
|
|
)
|
|
|
|
var ring0Cmd = &cobra.Command{
|
|
Use: "ring0",
|
|
Short: "starts ring0 - enter here",
|
|
Run: func(_ *cobra.Command, args []string) {
|
|
log.Init(ServiceName, Version, true, true)
|
|
log := log.WithField("ring", 0)
|
|
|
|
var failed bool
|
|
defer func() {
|
|
if !failed {
|
|
return
|
|
}
|
|
sleepForDebugging()
|
|
}()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
client, conn, err := connectToInWorkspaceDaemonService(ctx)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot connect to daemon")
|
|
return
|
|
}
|
|
defer conn.Close()
|
|
|
|
_, err = client.PrepareForUserNS(ctx, &daemonapi.PrepareForUserNSRequest{})
|
|
if err != nil {
|
|
log.WithError(err).Fatal("cannot prepare for user namespaces")
|
|
return
|
|
}
|
|
defer func() {
|
|
ctx, cancel = context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
_, err = client.Teardown(ctx, &daemonapi.TeardownRequest{})
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot trigger teardown")
|
|
failed = true
|
|
return
|
|
}
|
|
}()
|
|
|
|
cmd := exec.Command("/proc/self/exe", "ring1")
|
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
Pdeathsig: syscall.SIGKILL,
|
|
Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS,
|
|
}
|
|
cmd.Stdin = os.Stdin
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
cmd.Env = os.Environ()
|
|
|
|
if err := cmd.Start(); err != nil {
|
|
log.WithError(err).Error("failed to start ring0")
|
|
failed = true
|
|
return
|
|
}
|
|
|
|
sigc := make(chan os.Signal, 128)
|
|
signal.Notify(sigc)
|
|
go func() {
|
|
defer func() {
|
|
// This is a 'just in case' fallback, in case we're racing the cmd.Process and it's become
|
|
// nil in the time since we checked.
|
|
err := recover()
|
|
if err != nil {
|
|
log.WithField("recovered", err).Error("recovered from panic")
|
|
}
|
|
}()
|
|
|
|
for {
|
|
sig := <-sigc
|
|
if sig != unix.SIGTERM {
|
|
cmd.Process.Signal(sig)
|
|
continue
|
|
}
|
|
|
|
cmd.Process.Signal(unix.SIGTERM)
|
|
time.Sleep(ring1ShutdownTimeout)
|
|
if cmd.Process == nil {
|
|
return
|
|
}
|
|
|
|
log.Warn("ring1 did not shut down in time - sending sigkill")
|
|
err = cmd.Process.Kill()
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot kill ring1")
|
|
}
|
|
return
|
|
}
|
|
}()
|
|
|
|
err = cmd.Wait()
|
|
if eerr, ok := err.(*exec.ExitError); ok {
|
|
state, ok := eerr.ProcessState.Sys().(syscall.WaitStatus)
|
|
if ok && state.Signal() == syscall.SIGKILL {
|
|
log.Warn("ring1 was killed")
|
|
return
|
|
}
|
|
}
|
|
if err != nil {
|
|
log.WithError(err).Error("unexpected exit")
|
|
failed = true
|
|
return
|
|
}
|
|
},
|
|
}
|
|
|
|
var ring1Opts struct {
|
|
MappingEstablished bool
|
|
}
|
|
var ring1Cmd = &cobra.Command{
|
|
Use: "ring1",
|
|
Short: "starts ring1",
|
|
Run: func(_cmd *cobra.Command, args []string) {
|
|
log.Init(ServiceName, Version, true, true)
|
|
log := log.WithField("ring", 1)
|
|
defer log.Info("done")
|
|
|
|
var failed bool
|
|
defer func() {
|
|
if !failed {
|
|
return
|
|
}
|
|
sleepForDebugging()
|
|
}()
|
|
|
|
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
|
|
defer cancel()
|
|
|
|
client, conn, err := connectToInWorkspaceDaemonService(ctx)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot connect to daemon")
|
|
failed = true
|
|
return
|
|
}
|
|
defer conn.Close()
|
|
|
|
if !ring1Opts.MappingEstablished {
|
|
mapping := []*daemonapi.WriteIDMappingRequest_Mapping{
|
|
{ContainerId: 0, HostId: 33333, Size: 1},
|
|
{ContainerId: 1, HostId: 100000, Size: 65534},
|
|
}
|
|
_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: false, Mapping: mapping})
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot establish UID mapping")
|
|
failed = true
|
|
return
|
|
}
|
|
_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: true, Mapping: mapping})
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot establish GID mapping")
|
|
failed = true
|
|
return
|
|
}
|
|
err = syscall.Exec("/proc/self/exe", append(os.Args, "--mapping-established"), os.Environ())
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot exec /proc/self/exe")
|
|
failed = true
|
|
return
|
|
}
|
|
return
|
|
}
|
|
|
|
// The parent calls child with Pdeathsig, but it is cleared when the UID/GID mapping is written.
|
|
// (see also https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646).
|
|
//
|
|
// (cw) I have been able to reproduce this issue without newuidmap/newgidmap.
|
|
// See https://gist.github.com/csweichel/3fc9d4b0752367d4a436f969c8685c06
|
|
runtime.LockOSThread()
|
|
unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
|
|
runtime.UnlockOSThread()
|
|
|
|
tmpdir, err := ioutil.TempDir("", "supervisor")
|
|
if err != nil {
|
|
log.WithError(err).Fatal("cannot create tempdir")
|
|
}
|
|
|
|
mnts := []struct {
|
|
Target string
|
|
Source string
|
|
FSType string
|
|
Flags uintptr
|
|
}{
|
|
// TODO(cw): pull mark mount location from config
|
|
{Target: "/", Source: "/.workspace/mark", FSType: "shiftfs"},
|
|
{Target: "/sys", Flags: unix.MS_BIND | unix.MS_REC},
|
|
{Target: "/dev", Flags: unix.MS_BIND | unix.MS_REC},
|
|
// TODO(cw): only mount /theia if it's in the mount table, i.e. this isn't a registry-facade workspace
|
|
{Target: "/theia", Flags: unix.MS_BIND | unix.MS_REC},
|
|
// TODO(cw): only mount /workspace if it's in the mount table, i.e. this isn't an FWB workspace
|
|
{Target: "/workspace", Flags: unix.MS_BIND | unix.MS_REC},
|
|
{Target: "/etc/hosts", Flags: unix.MS_BIND | unix.MS_REC},
|
|
{Target: "/etc/hostname", Flags: unix.MS_BIND | unix.MS_REC},
|
|
{Target: "/etc/resolv.conf", Flags: unix.MS_BIND | unix.MS_REC},
|
|
{Target: "/tmp", Source: "tmpfs", FSType: "tmpfs"},
|
|
}
|
|
for _, m := range mnts {
|
|
dst := filepath.Join(tmpdir, m.Target)
|
|
_ = os.MkdirAll(dst, 0644)
|
|
|
|
if m.Source == "" {
|
|
m.Source = m.Target
|
|
}
|
|
if m.FSType == "" {
|
|
m.FSType = "none"
|
|
}
|
|
|
|
log.WithFields(map[string]interface{}{
|
|
"source": m.Source,
|
|
"target": dst,
|
|
"fstype": m.FSType,
|
|
"flags": m.Flags,
|
|
}).Debug("mounting new rootfs")
|
|
err = unix.Mount(m.Source, dst, m.FSType, m.Flags, "")
|
|
if err != nil {
|
|
log.WithError(err).WithField("dest", dst).Error("cannot establish mount")
|
|
failed = true
|
|
return
|
|
}
|
|
}
|
|
|
|
pipeR, pipeW, err := os.Pipe()
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot mount create pipe")
|
|
failed = true
|
|
return
|
|
}
|
|
defer pipeW.Close()
|
|
|
|
cmd := exec.Command("/proc/self/exe", "ring2")
|
|
cmd.SysProcAttr = &syscall.SysProcAttr{
|
|
Pdeathsig: syscall.SIGKILL,
|
|
Cloneflags: syscall.CLONE_NEWNS | syscall.CLONE_NEWPID,
|
|
}
|
|
cmd.ExtraFiles = []*os.File{pipeR}
|
|
cmd.Dir = tmpdir
|
|
cmd.Stdin = os.Stdin
|
|
cmd.Stdout = os.Stdout
|
|
cmd.Stderr = os.Stderr
|
|
cmd.Env = os.Environ()
|
|
if err := cmd.Start(); err != nil {
|
|
log.WithError(err).Error("failed to start the child process")
|
|
failed = true
|
|
return
|
|
}
|
|
sigc := sigproxy.ForwardAllSignals(context.Background(), cmd.Process.Pid)
|
|
defer sigproxysignal.StopCatch(sigc)
|
|
|
|
procLoc := filepath.Join(tmpdir, "proc")
|
|
err = os.MkdirAll(procLoc, 0755)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot mount proc")
|
|
failed = true
|
|
return
|
|
}
|
|
_, err = client.MountProc(ctx, &daemonapi.MountProcRequest{
|
|
Target: procLoc,
|
|
Pid: int64(cmd.Process.Pid),
|
|
})
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot mount proc")
|
|
failed = true
|
|
return
|
|
}
|
|
|
|
log.Info("signaling to child process")
|
|
_, err = msgutil.MarshalToWriter(pipeW, ringSyncMsg{
|
|
Stage: 1,
|
|
Rootfs: tmpdir,
|
|
})
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot send signal to child process")
|
|
failed = true
|
|
return
|
|
}
|
|
|
|
err = cmd.Wait()
|
|
if err != nil {
|
|
log.WithError(err).Error("unexpected exit")
|
|
failed = true
|
|
return
|
|
}
|
|
},
|
|
}
|
|
|
|
var ring2Opts struct {
|
|
SupervisorPath string
|
|
}
|
|
var ring2Cmd = &cobra.Command{
|
|
Use: "ring2",
|
|
Short: "starts ring2",
|
|
Run: func(_cmd *cobra.Command, args []string) {
|
|
log.Init(ServiceName, Version, true, true)
|
|
log := log.WithField("ring", 2)
|
|
defer log.Info("done")
|
|
|
|
var failed bool
|
|
defer func() {
|
|
if !failed {
|
|
return
|
|
}
|
|
sleepForDebugging()
|
|
}()
|
|
|
|
// wait for /proc by listening on the parent's pipe.
|
|
// fd=3 is the pipe's FD passed in from the parent via extraFiles.
|
|
pipeR := os.NewFile(uintptr(3), "")
|
|
var msg ringSyncMsg
|
|
_, err := msgutil.UnmarshalFromReader(pipeR, &msg)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot read from parent pipe")
|
|
failed = true
|
|
return
|
|
}
|
|
if msg.Stage != 1 {
|
|
log.WithError(err).WithField("msg", fmt.Sprintf("%+q", msg)).Error("expected stage 1 sync message")
|
|
failed = true
|
|
return
|
|
}
|
|
|
|
err = pivotRoot(msg.Rootfs)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot pivot root")
|
|
failed = true
|
|
return
|
|
}
|
|
|
|
err = cap.SetGroups(33333)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot setgid")
|
|
failed = true
|
|
return
|
|
}
|
|
err = cap.SetUID(33333)
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot setuid")
|
|
failed = true
|
|
return
|
|
}
|
|
err = unix.Exec(ring2Opts.SupervisorPath, []string{"supervisor", "run", "--inns"}, os.Environ())
|
|
if err != nil {
|
|
log.WithError(err).Error("cannot exec")
|
|
failed = true
|
|
return
|
|
}
|
|
},
|
|
}
|
|
|
|
// pivotRoot will call pivot_root such that rootfs becomes the new root
|
|
// filesystem, and everything else is cleaned up.
|
|
//
|
|
// copied from runc: https://github.com/opencontainers/runc/blob/cf6c074115d00c932ef01dedb3e13ba8b8f964c3/libcontainer/rootfs_linux.go#L760
|
|
func pivotRoot(rootfs string) error {
|
|
// While the documentation may claim otherwise, pivot_root(".", ".") is
|
|
// actually valid. What this results in is / being the new root but
|
|
// /proc/self/cwd being the old root. Since we can play around with the cwd
|
|
// with pivot_root this allows us to pivot without creating directories in
|
|
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
|
|
|
|
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer unix.Close(oldroot)
|
|
|
|
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
|
|
if err != nil {
|
|
return err
|
|
}
|
|
defer unix.Close(newroot)
|
|
|
|
// Change to the new root so that the pivot_root actually acts on it.
|
|
if err := unix.Fchdir(newroot); err != nil {
|
|
return err
|
|
}
|
|
|
|
if err := unix.PivotRoot(".", "."); err != nil {
|
|
return fmt.Errorf("pivot_root %s", err)
|
|
}
|
|
|
|
// Currently our "." is oldroot (according to the current kernel code).
|
|
// However, purely for safety, we will fchdir(oldroot) since there isn't
|
|
// really any guarantee from the kernel what /proc/self/cwd will be after a
|
|
// pivot_root(2).
|
|
|
|
if err := unix.Fchdir(oldroot); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Make oldroot rslave to make sure our unmounts don't propagate to the
|
|
// host (and thus bork the machine). We don't use rprivate because this is
|
|
// known to cause issues due to races where we still have a reference to a
|
|
// mount while a process in the host namespace are trying to operate on
|
|
// something they think has no mounts (devicemapper in particular).
|
|
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
|
|
return err
|
|
}
|
|
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
|
|
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
|
|
return err
|
|
}
|
|
|
|
// Switch back to our shiny new root.
|
|
if err := unix.Chdir("/"); err != nil {
|
|
return fmt.Errorf("chdir / %s", err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
func sleepForDebugging() {
|
|
log.Info("sleeping five minutes to allow debugging")
|
|
sigChan := make(chan os.Signal, 1)
|
|
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
|
|
select {
|
|
case <-sigChan:
|
|
case <-time.After(5 * time.Minute):
|
|
}
|
|
os.Exit(1)
|
|
}
|
|
|
|
type ringSyncMsg struct {
|
|
Stage int `json:"stage"`
|
|
Rootfs string `json:"rootfs"`
|
|
}
|
|
|
|
// ConnectToInWorkspaceDaemonService attempts to connect to the InWorkspaceService offered by the ws-daemon.
|
|
func connectToInWorkspaceDaemonService(ctx context.Context) (daemonapi.InWorkspaceServiceClient, *grpc.ClientConn, error) {
|
|
const socketFN = "/.workspace/daemon.sock"
|
|
|
|
t := time.NewTicker(500 * time.Millisecond)
|
|
defer t.Stop()
|
|
for {
|
|
if _, err := os.Stat(socketFN); err == nil {
|
|
break
|
|
}
|
|
|
|
select {
|
|
case <-t.C:
|
|
continue
|
|
case <-ctx.Done():
|
|
return nil, nil, fmt.Errorf("socket did not appear before context was canceled")
|
|
}
|
|
}
|
|
|
|
conn, err := grpc.DialContext(ctx, "unix://"+socketFN, grpc.WithInsecure())
|
|
if err != nil {
|
|
return nil, nil, err
|
|
}
|
|
return daemonapi.NewInWorkspaceServiceClient(conn), conn, nil
|
|
}
|
|
|
|
func init() {
|
|
rootCmd.AddCommand(ring0Cmd)
|
|
rootCmd.AddCommand(ring1Cmd)
|
|
rootCmd.AddCommand(ring2Cmd)
|
|
|
|
supervisorPath := os.Getenv("GITPOD_WORKSPACEKIT_SUPERVISOR_PATH")
|
|
if supervisorPath == "" {
|
|
wd, err := os.Getwd()
|
|
if err == nil {
|
|
supervisorPath = "supervisor"
|
|
} else {
|
|
supervisorPath = filepath.Join(wd, "supervisor")
|
|
}
|
|
}
|
|
|
|
ring1Cmd.Flags().BoolVar(&ring1Opts.MappingEstablished, "mapping-established", false, "true if the UID/GID mapping has already been established")
|
|
ring2Cmd.Flags().StringVar(&ring2Opts.SupervisorPath, "supervisor-path", supervisorPath, "path to the supervisor binary (taken from $GITPOD_WORKSPACEKIT_SUPERVISOR_PATH, defaults to '$PWD/supervisor')")
|
|
}
|