974 lines
26 KiB
Go

// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.
package cmd
import (
"bufio"
"bytes"
"context"
"encoding/json"
"fmt"
"io"
"io/ioutil"
"net"
"os"
"os/exec"
"os/signal"
"path/filepath"
"runtime"
"strconv"
"strings"
"syscall"
"time"
"github.com/rootless-containers/rootlesskit/pkg/msgutil"
"github.com/rootless-containers/rootlesskit/pkg/sigproxy"
sigproxysignal "github.com/rootless-containers/rootlesskit/pkg/sigproxy/signal"
libseccomp "github.com/seccomp/libseccomp-golang"
"github.com/spf13/cobra"
"golang.org/x/sys/unix"
"golang.org/x/xerrors"
"google.golang.org/grpc"
common_grpc "github.com/gitpod-io/gitpod/common-go/grpc"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/workspacekit/pkg/lift"
"github.com/gitpod-io/gitpod/workspacekit/pkg/seccomp"
"github.com/gitpod-io/gitpod/ws-daemon/api"
daemonapi "github.com/gitpod-io/gitpod/ws-daemon/api"
)
const (
// ring1ShutdownTimeout is the time ring1 gets between SIGTERM and SIGKILL.
// We do this to ensure we have enough time left for ring0 to clean up prior
// to receiving SIGKILL from the kubelet.
//
// This time must give ring1 enough time to shut down (see time budgets in supervisor.go),
// and to talk to ws-daemon within the terminationGracePeriod of the workspace pod.
ring1ShutdownTimeout = 20 * time.Second
// ring2StartupTimeout is the maximum time we wait between starting ring2 and its
// attempt to connect to the parent socket.
ring2StartupTimeout = 5 * time.Second
)
var ring0Cmd = &cobra.Command{
Use: "ring0",
Short: "starts ring0 - enter here",
Run: func(_ *cobra.Command, args []string) {
log.Init(ServiceName, Version, true, false)
log := log.WithField("ring", 0)
common_grpc.SetupLogging()
exitCode := 1
defer handleExit(&exitCode)
defer log.Info("done")
ctx, cancel := context.WithTimeout(context.Background(), 30*time.Second)
defer cancel()
client, err := connectToInWorkspaceDaemonService(ctx)
if err != nil {
log.WithError(err).Error("cannot connect to daemon")
return
}
prep, err := client.PrepareForUserNS(ctx, &daemonapi.PrepareForUserNSRequest{})
if err != nil {
log.WithError(err).Fatal("cannot prepare for user namespaces")
return
}
client.Close()
defer func() {
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
client, err := connectToInWorkspaceDaemonService(ctx)
if err != nil {
log.WithError(err).Error("cannot connect to daemon")
return
}
defer client.Close()
_, err = client.Teardown(ctx, &daemonapi.TeardownRequest{})
if err != nil {
log.WithError(err).Error("cannot trigger teardown")
}
}()
cmd := exec.Command("/proc/self/exe", "ring1")
cmd.SysProcAttr = &syscall.SysProcAttr{
Pdeathsig: syscall.SIGKILL,
Cloneflags: syscall.CLONE_NEWUSER | syscall.CLONE_NEWNS | unix.CLONE_NEWCGROUP,
}
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = append(os.Environ(),
"WORKSPACEKIT_FSSHIFT="+prep.FsShift.String(),
fmt.Sprintf("WORKSPACEKIT_FULL_WORKSPACE_BACKUP=%v", prep.FullWorkspaceBackup),
)
if err := cmd.Start(); err != nil {
log.WithError(err).Error("failed to start ring0")
return
}
sigc := make(chan os.Signal, 128)
signal.Notify(sigc)
go func() {
defer func() {
// This is a 'just in case' fallback, in case we're racing the cmd.Process and it's become
// nil in the time since we checked.
err := recover()
if err != nil {
log.WithField("recovered", err).Error("recovered from panic")
}
}()
for {
sig := <-sigc
if sig != unix.SIGTERM {
_ = cmd.Process.Signal(sig)
continue
}
_ = cmd.Process.Signal(unix.SIGTERM)
time.Sleep(ring1ShutdownTimeout)
if cmd.Process == nil {
return
}
log.Warn("ring1 did not shut down in time - sending sigkill")
err = cmd.Process.Kill()
if err != nil {
if isProcessAlreadyFinished(err) {
err = nil
return
}
log.WithError(err).Error("cannot kill ring1")
}
return
}
}()
err = cmd.Wait()
if eerr, ok := err.(*exec.ExitError); ok {
state, ok := eerr.ProcessState.Sys().(syscall.WaitStatus)
if ok && state.Signal() == syscall.SIGKILL {
log.Warn("ring1 was killed")
return
}
}
if err != nil {
if eerr, ok := err.(*exec.ExitError); ok {
exitCode = eerr.ExitCode()
}
log.WithError(err).Error("unexpected exit")
return
}
exitCode = 0 // once we get here everythings good
},
}
var ring1Opts struct {
MappingEstablished bool
}
var ring1Cmd = &cobra.Command{
Use: "ring1",
Short: "starts ring1",
Run: func(_cmd *cobra.Command, args []string) {
log.Init(ServiceName, Version, true, false)
log := log.WithField("ring", 1)
common_grpc.SetupLogging()
exitCode := 1
defer handleExit(&exitCode)
defer log.Info("done")
ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
defer cancel()
mapping := []*daemonapi.WriteIDMappingRequest_Mapping{
{ContainerId: 0, HostId: 33333, Size: 1},
{ContainerId: 1, HostId: 100000, Size: 65534},
}
if !ring1Opts.MappingEstablished {
client, err := connectToInWorkspaceDaemonService(ctx)
if err != nil {
log.WithError(err).Error("cannot connect to daemon")
return
}
defer client.Close()
_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: false, Mapping: mapping})
if err != nil {
log.WithError(err).Error("cannot establish UID mapping")
return
}
_, err = client.WriteIDMapping(ctx, &daemonapi.WriteIDMappingRequest{Pid: int64(os.Getpid()), Gid: true, Mapping: mapping})
if err != nil {
log.WithError(err).Error("cannot establish GID mapping")
return
}
err = syscall.Exec("/proc/self/exe", append(os.Args, "--mapping-established"), os.Environ())
if err != nil {
log.WithError(err).Error("cannot exec /proc/self/exe")
return
}
return
}
// The parent calls child with Pdeathsig, but it is cleared when the UID/GID mapping is written.
// (see also https://github.com/rootless-containers/rootlesskit/issues/65#issuecomment-492343646).
//
// (cw) I have been able to reproduce this issue without newuidmap/newgidmap.
// See https://gist.github.com/csweichel/3fc9d4b0752367d4a436f969c8685c06
runtime.LockOSThread()
_ = unix.Prctl(unix.PR_SET_PDEATHSIG, uintptr(unix.SIGKILL), 0, 0, 0)
runtime.UnlockOSThread()
ring2Root, err := os.MkdirTemp("", "supervisor")
if err != nil {
log.WithError(err).Fatal("cannot create tempdir")
}
var fsshift api.FSShiftMethod
if v, ok := api.FSShiftMethod_value[os.Getenv("WORKSPACEKIT_FSSHIFT")]; !ok {
log.WithField("fsshift", os.Getenv("WORKSPACEKIT_FSSHIFT")).Fatal("unknown FS shift method")
} else {
fsshift = api.FSShiftMethod(v)
}
type mnte struct {
Target string
Source string
FSType string
Flags uintptr
}
var mnts []mnte
switch fsshift {
case api.FSShiftMethod_FUSE:
mnts = append(mnts,
mnte{Target: "/", Source: "/.workspace/mark", Flags: unix.MS_BIND | unix.MS_REC},
)
case api.FSShiftMethod_SHIFTFS:
mnts = append(mnts,
mnte{Target: "/", Source: "/.workspace/mark", FSType: "shiftfs"},
)
default:
log.WithField("fsshift", fsshift).Fatal("unknown FS shift method")
}
procMounts, err := ioutil.ReadFile("/proc/mounts")
if err != nil {
log.WithError(err).Fatal("cannot read /proc/mounts")
}
candidates, err := findBindMountCandidates(bytes.NewReader(procMounts), os.Readlink)
if err != nil {
log.WithError(err).Fatal("cannot detect mount candidates")
}
for _, c := range candidates {
mnts = append(mnts, mnte{Target: c, Flags: unix.MS_BIND | unix.MS_REC})
}
mnts = append(mnts, mnte{Target: "/tmp", Source: "tmpfs", FSType: "tmpfs"})
// If this is a cgroupv2 machine, we'll want to mount the cgroup2 FS ourselves
if _, err := os.Stat("/sys/fs/cgroup/cgroup.controllers"); err == nil {
mnts = append(mnts, mnte{Target: "/sys/fs/cgroup", Source: "tmpfs", FSType: "tmpfs"})
mnts = append(mnts, mnte{Target: "/sys/fs/cgroup", Source: "cgroup", FSType: "cgroup2"})
}
if adds := os.Getenv("GITPOD_WORKSPACEKIT_BIND_MOUNTS"); adds != "" {
var additionalMounts []string
err = json.Unmarshal([]byte(adds), &additionalMounts)
if err != nil {
log.WithError(err).Fatal("cannot unmarshal GITPOD_WORKSPACEKIT_BIND_MOUNTS")
}
for _, c := range additionalMounts {
mnts = append(mnts, mnte{Target: c, Flags: unix.MS_BIND | unix.MS_REC})
}
}
// FWB workspaces do not require mounting /workspace
// if that is done, the backup will not contain any change in the directory
if os.Getenv("WORKSPACEKIT_FULL_WORKSPACE_BACKUP") != "true" {
mnts = append(mnts,
mnte{Target: "/workspace", Flags: unix.MS_BIND | unix.MS_REC},
)
}
f, err := ioutil.TempDir("", "wskit-slirp4netns")
if err != nil {
log.WithError(err).Error("cannot create slirp4netns socket tempdir")
return
}
mnts = append(mnts, mnte{Target: "/.supervisor/slirp4netns.sock", Source: f, Flags: unix.MS_BIND | unix.MS_REC})
for _, m := range mnts {
dst := filepath.Join(ring2Root, m.Target)
_ = os.MkdirAll(dst, 0644)
if m.Source == "" {
m.Source = m.Target
}
if m.FSType == "" {
m.FSType = "none"
}
log.WithFields(map[string]interface{}{
"source": m.Source,
"target": dst,
"fstype": m.FSType,
"flags": m.Flags,
}).Debug("mounting new rootfs")
err = unix.Mount(m.Source, dst, m.FSType, m.Flags, "")
if err != nil {
log.WithError(err).WithField("dest", dst).WithField("fsType", m.FSType).Error("cannot establish mount")
return
}
}
// We deliberately do not bind mount `/etc/resolv.conf` and `/etc/hosts`, but instead place a copy
// so that users in the workspace can modify the file.
copyPaths := []string{"/etc/resolv.conf", "/etc/hosts"}
for _, fn := range copyPaths {
err = copyRing2Root(ring2Root, fn)
if err != nil {
log.WithError(err).Warn("cannot copy " + fn)
}
}
err = makeHostnameLocal(ring2Root)
if err != nil {
log.WithError(err).Warn("cannot make /etc/hosts hostname local")
}
env := make([]string, 0, len(os.Environ()))
for _, e := range os.Environ() {
if strings.HasPrefix(e, "WORKSPACEKIT_") {
continue
}
env = append(env, e)
}
env = append(env, "WORKSPACEKIT_WRAP_NETNS=true")
socketFN := filepath.Join(os.TempDir(), fmt.Sprintf("workspacekit-ring1-%d.unix", time.Now().UnixNano()))
skt, err := net.Listen("unix", socketFN)
if err != nil {
log.WithError(err).Error("cannot create socket for ring2")
return
}
defer skt.Close()
var (
cloneFlags uintptr = syscall.CLONE_NEWNS | syscall.CLONE_NEWPID | syscall.CLONE_NEWNET
)
cmd := exec.Command("/proc/self/exe", "ring2", socketFN)
cmd.SysProcAttr = &syscall.SysProcAttr{
Pdeathsig: syscall.SIGKILL,
Cloneflags: cloneFlags,
}
cmd.Dir = ring2Root
cmd.Stdin = os.Stdin
cmd.Stdout = os.Stdout
cmd.Stderr = os.Stderr
cmd.Env = env
if err := cmd.Start(); err != nil {
log.WithError(err).Error("failed to start the child process")
return
}
sigc := sigproxy.ForwardAllSignals(context.Background(), cmd.Process.Pid)
defer sigproxysignal.StopCatch(sigc)
procLoc := filepath.Join(ring2Root, "proc")
err = os.MkdirAll(procLoc, 0755)
if err != nil {
log.WithError(err).Error("cannot create directory for mounting proc")
return
}
client, err := connectToInWorkspaceDaemonService(ctx)
if err != nil {
log.WithError(err).Error("cannot connect to daemon")
return
}
_, err = client.MountProc(ctx, &daemonapi.MountProcRequest{
Target: procLoc,
Pid: int64(cmd.Process.Pid),
})
if err != nil {
client.Close()
log.WithError(err).Error("cannot mount proc")
return
}
_, err = client.EvacuateCGroup(ctx, &daemonapi.EvacuateCGroupRequest{})
if err != nil {
client.Close()
log.WithError(err).Error("cannot evacuate cgroup")
return
}
client.Close()
// We have to wait for ring2 to come back to us and connect to the socket we've passed along.
// There's a chance that ring2 crashes or misbehaves, so we don't want to wait forever, hence
// the someone complicated "accept" logic below.
// If there's a deadline that can be set somewhere that we've missed, we should be using that
// one instead.
incoming := make(chan net.Conn, 1)
errc := make(chan error, 1)
go func() {
defer close(incoming)
defer close(errc)
// Accept stops the latest when we close the socket.
c, err := skt.Accept()
if err != nil {
errc <- err
return
}
incoming <- c
}()
var ring2Conn *net.UnixConn
for {
var brek bool
select {
case err = <-errc:
if err != nil {
brek = true
}
case c := <-incoming:
if c == nil {
continue
}
ring2Conn = c.(*net.UnixConn)
brek = true
case <-time.After(ring2StartupTimeout):
err = xerrors.Errorf("ring2 did not connect in time")
brek = true
}
if brek {
break
}
}
if err != nil {
log.WithError(err).Error("ring2 did not connect successfully")
return
}
client, err = connectToInWorkspaceDaemonService(ctx)
if err != nil {
log.WithError(err).Error("cannot connect to daemon")
return
}
_, err = client.SetupPairVeths(ctx, &daemonapi.SetupPairVethsRequest{Pid: int64(cmd.Process.Pid)})
if err != nil {
log.WithError(err).Error("cannot setup pair of veths")
return
}
client.Close()
log.Info("signaling to child process")
_, err = msgutil.MarshalToWriter(ring2Conn, ringSyncMsg{
Stage: 1,
Rootfs: ring2Root,
FSShift: fsshift,
})
if err != nil {
log.WithError(err).Error("cannot send ring sync msg to ring2")
return
}
log.Info("awaiting seccomp fd")
scmpfd, err := receiveSeccmpFd(ring2Conn)
if err != nil {
log.WithError(err).Error("did not receive seccomp fd from ring2")
return
}
if scmpfd == 0 {
log.Warn("received 0 as ring2 seccomp fd - syscall handling is broken")
} else {
handler := &seccomp.InWorkspaceHandler{
FD: scmpfd,
Daemon: func(ctx context.Context) (seccomp.InWorkspaceServiceClient, error) {
return connectToInWorkspaceDaemonService(ctx)
},
Ring2PID: cmd.Process.Pid,
Ring2Rootfs: ring2Root,
BindEvents: make(chan seccomp.BindEvent),
}
stp, errchan := seccomp.Handle(scmpfd, handler)
defer close(stp)
go func() {
t := time.NewTicker(10 * time.Millisecond)
defer t.Stop()
for {
// We use the ticker to rate-limit the errors from the syscall handler.
// We're only handling low-frequency syscalls (e.g. mount), and don't want
// the handler to hog the CPU because it fails on its fd.
<-t.C
err := <-errchan
if err == nil {
return
}
log.WithError(err).Warn("syscall handler error")
}
}()
}
if enclave := os.Getenv("WORKSPACEKIT_RING2_ENCLAVE"); enclave != "" {
ecmd := exec.Command("/proc/self/exe", append([]string{"nsenter", "--target", strconv.Itoa(cmd.Process.Pid), "--mount", "--net"}, strings.Fields(enclave)...)...)
ecmd.Stdout = os.Stdout
ecmd.Stderr = os.Stderr
err := ecmd.Start()
if err != nil {
log.WithError(err).WithField("cmd", enclave).Error("cannot run enclave")
return
}
}
go func() {
err := lift.ServeLift(ctx, lift.DefaultSocketPath)
if err != nil {
log.WithError(err).Error("failed to serve ring1 command lift")
}
}()
err = cmd.Wait()
if err != nil {
if eerr, ok := err.(*exec.ExitError); ok {
exitCode = eerr.ExitCode()
}
log.WithError(err).Error("unexpected exit")
return
}
exitCode = 0 // once we get here everythings good
},
}
var (
knownMountCandidatePaths = []string{
"/workspace",
"/sys",
"/dev",
"/etc/hostname",
"/etc/ssl/certs/gitpod-ca.crt",
}
rejectMountPaths = map[string]struct{}{
"/etc/resolv.conf": {},
"/etc/hosts": {},
}
)
// findBindMountCandidates attempts to find bind mount candidates in the ring0 mount namespace.
// It does that by either checking for knownMountCandidatePaths, or after rejecting based on filesystems (e.g. cgroup or proc),
// checking if in the root of the mountpoint there's a `..data` symlink pointing to a file starting with `..`.
// That's how configMaps and secrets behave in Kubernetes.
//
// Note/Caveat: configMap or secret volumes with a subPath do not behave as described above and will not be recognised by this function.
// in those cases you'll want to use GITPOD_WORKSPACEKIT_BIND_MOUNTS to explicitely list those paths.
func findBindMountCandidates(procMounts io.Reader, readlink func(path string) (dest string, err error)) (mounts []string, err error) {
scanner := bufio.NewScanner(procMounts)
for scanner.Scan() {
fields := strings.Fields(scanner.Text())
if len(fields) < 4 {
continue
}
// accept known paths
var (
path = fields[1]
accept bool
)
for _, p := range knownMountCandidatePaths {
if p == path {
accept = true
break
}
}
if accept {
mounts = append(mounts, path)
continue
}
// reject known filesystems
var (
fs = fields[0]
reject bool
)
switch fs {
case "cgroup", "devpts", "mqueue", "shm", "proc", "sysfs", "cgroup2":
reject = true
}
if reject {
continue
}
// reject known paths
if _, ok := rejectMountPaths[path]; ok {
continue
}
// test remaining candidates if they're a Kubernetes configMap or secret
ln, err := readlink(filepath.Join(path, "..data"))
if err != nil {
continue
}
if !strings.HasPrefix(ln, "..") {
continue
}
mounts = append(mounts, path)
}
return mounts, scanner.Err()
}
// copyRing2Root copies <fn> to <ring2root>/<fn>
func copyRing2Root(ring2root string, fn string) error {
stat, err := os.Stat(fn)
if err != nil {
return err
}
org, err := os.Open(fn)
if err != nil {
return err
}
defer org.Close()
dst, err := os.OpenFile(filepath.Join(ring2root, fn), os.O_CREATE|os.O_TRUNC|os.O_WRONLY, stat.Mode())
if err != nil {
return err
}
defer dst.Close()
_, err = io.Copy(dst, org)
if err != nil {
return err
}
return nil
}
func makeHostnameLocal(ring2root string) error {
hostname, err := os.Hostname()
if err != nil {
return err
}
path := filepath.Join(ring2root, "/etc/hosts")
stat, err := os.Stat(path)
if err != nil {
return err
}
b, err := ioutil.ReadFile(path)
if err != nil {
return err
}
bStr := string(b)
lines := strings.Split(bStr, "\n")
for i, line := range lines {
fields := strings.Fields(line)
if len(fields) != 2 {
continue
}
if fields[1] == hostname {
lines[i] = "127.0.0.1 " + hostname
}
}
return ioutil.WriteFile(path, []byte(strings.Join(lines, "\n")), stat.Mode())
}
func receiveSeccmpFd(conn *net.UnixConn) (libseccomp.ScmpFd, error) {
buf := make([]byte, unix.CmsgSpace(4))
err := conn.SetDeadline(time.Now().Add(5 * time.Second))
if err != nil {
return 0, xerrors.Errorf("cannot setdeadline: %v", err)
}
f, err := conn.File()
if err != nil {
return 0, xerrors.Errorf("cannot open socket: %v", err)
}
defer f.Close()
connfd := int(f.Fd())
_, _, _, _, err = unix.Recvmsg(connfd, nil, buf, 0)
if err != nil {
return 0, xerrors.Errorf("cannot recvmsg from fd '%d': %v", connfd, err)
}
msgs, err := unix.ParseSocketControlMessage(buf)
if err != nil {
return 0, xerrors.Errorf("cannot parse socket control message: %v", err)
}
if len(msgs) != 1 {
return 0, xerrors.Errorf("expected a single socket control message")
}
fds, err := unix.ParseUnixRights(&msgs[0])
if err != nil {
return 0, xerrors.Errorf("cannot parse unix rights: %v", err)
}
if len(fds) == 0 {
return 0, xerrors.Errorf("expected a single socket FD")
}
return libseccomp.ScmpFd(fds[0]), nil
}
var ring2Opts struct {
SupervisorPath string
}
var ring2Cmd = &cobra.Command{
Use: "ring2 <ring1Socket>",
Short: "starts ring2",
Args: cobra.ExactArgs(1),
Run: func(_cmd *cobra.Command, args []string) {
log.Init(ServiceName, Version, true, false)
log := log.WithField("ring", 2)
common_grpc.SetupLogging()
exitCode := 1
defer handleExit(&exitCode)
defer log.Info("done")
// we talk to ring1 using a Unix socket, so that we can send the seccomp fd across.
rconn, err := net.Dial("unix", args[0])
if err != nil {
log.WithError(err).Error("cannot connect to parent")
return
}
conn := rconn.(*net.UnixConn)
defer conn.Close()
log.Info("connected to parent socket")
// Before we do anything, we wait for the parent to make /proc available to us.
var msg ringSyncMsg
_, err = msgutil.UnmarshalFromReader(conn, &msg)
if err != nil {
log.WithError(err).Error("cannot read parent message")
return
}
if msg.Stage != 1 {
log.WithError(err).WithField("msg", fmt.Sprintf("%+q", msg)).Error("expected stage 1 sync message")
return
}
err = pivotRoot(msg.Rootfs, msg.FSShift)
if err != nil {
log.WithError(err).Error("cannot pivot root")
return
}
// Now that we're in our new root filesystem, including proc and all, we can load
// our seccomp filter, and tell our parent about it.
scmpFd, err := seccomp.LoadFilter()
if err != nil {
log.WithError(err).Error("cannot load seccomp filter - syscall handling would be broken")
return
}
connf, err := conn.File()
if err != nil {
log.WithError(err).Error("cannot get parent socket fd")
return
}
defer connf.Close()
sktfd := int(connf.Fd())
err = unix.Sendmsg(sktfd, nil, unix.UnixRights(int(scmpFd)), nil, 0)
if err != nil {
log.WithError(err).Error("cannot send seccomp fd")
return
}
err = unix.Exec(ring2Opts.SupervisorPath, []string{"supervisor", "init"}, os.Environ())
if err != nil {
if eerr, ok := err.(*exec.ExitError); ok {
exitCode = eerr.ExitCode()
}
log.WithError(err).WithField("cmd", ring2Opts.SupervisorPath).Error("cannot exec")
return
}
exitCode = 0 // once we get here everythings good
},
}
// pivotRoot will call pivot_root such that rootfs becomes the new root
// filesystem, and everything else is cleaned up.
//
// copied from runc: https://github.com/opencontainers/runc/blob/cf6c074115d00c932ef01dedb3e13ba8b8f964c3/libcontainer/rootfs_linux.go#L760
func pivotRoot(rootfs string, fsshift api.FSShiftMethod) error {
// While the documentation may claim otherwise, pivot_root(".", ".") is
// actually valid. What this results in is / being the new root but
// /proc/self/cwd being the old root. Since we can play around with the cwd
// with pivot_root this allows us to pivot without creating directories in
// the rootfs. Shout-outs to the LXC developers for giving us this idea.
oldroot, err := unix.Open("/", unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
}
defer unix.Close(oldroot)
newroot, err := unix.Open(rootfs, unix.O_DIRECTORY|unix.O_RDONLY, 0)
if err != nil {
return err
}
defer unix.Close(newroot)
// Change to the new root so that the pivot_root actually acts on it.
if err := unix.Fchdir(newroot); err != nil {
return err
}
if err := unix.PivotRoot(".", "."); err != nil {
return xerrors.Errorf("pivot_root %s", err)
}
// Currently our "." is oldroot (according to the current kernel code).
// However, purely for safety, we will fchdir(oldroot) since there isn't
// really any guarantee from the kernel what /proc/self/cwd will be after a
// pivot_root(2).
if err := unix.Fchdir(oldroot); err != nil {
return err
}
// Make oldroot rslave to make sure our unmounts don't propagate to the
// host (and thus bork the machine). We don't use rprivate because this is
// known to cause issues due to races where we still have a reference to a
// mount while a process in the host namespace are trying to operate on
// something they think has no mounts (devicemapper in particular).
if err := unix.Mount("", ".", "", unix.MS_SLAVE|unix.MS_REC, ""); err != nil {
return err
}
// Preform the unmount. MNT_DETACH allows us to unmount /proc/self/cwd.
if err := unix.Unmount(".", unix.MNT_DETACH); err != nil {
return err
}
// Switch back to our shiny new root.
if err := unix.Chdir("/"); err != nil {
return xerrors.Errorf("chdir / %s", err)
}
return nil
}
func handleExit(ec *int) {
exitCode := *ec
if exitCode != 0 {
sleepForDebugging()
}
os.Exit(exitCode)
}
func sleepForDebugging() {
if os.Getenv("GITPOD_WORKSPACEKIT_SLEEP_FOR_DEBUGGING") != "true" {
return
}
log.Info("sleeping five minutes to allow debugging")
sigChan := make(chan os.Signal, 1)
signal.Notify(sigChan, syscall.SIGINT, syscall.SIGTERM)
select {
case <-sigChan:
case <-time.After(5 * time.Minute):
}
}
type ringSyncMsg struct {
Stage int `json:"stage"`
Rootfs string `json:"rootfs"`
FSShift api.FSShiftMethod `json:"fsshift"`
}
type inWorkspaceServiceClient struct {
daemonapi.InWorkspaceServiceClient
conn *grpc.ClientConn
}
func (iwsc *inWorkspaceServiceClient) Close() error {
if iwsc.conn == nil {
return nil
}
return iwsc.conn.Close()
}
// ConnectToInWorkspaceDaemonService attempts to connect to the InWorkspaceService offered by the ws-daemon.
func connectToInWorkspaceDaemonService(ctx context.Context) (*inWorkspaceServiceClient, error) {
const socketFN = "/.workspace/daemon.sock"
t := time.NewTicker(500 * time.Millisecond)
defer t.Stop()
for {
if _, err := os.Stat(socketFN); err == nil {
break
}
select {
case <-t.C:
continue
case <-ctx.Done():
return nil, xerrors.Errorf("socket did not appear before context was canceled")
}
}
conn, err := grpc.DialContext(ctx, "unix://"+socketFN, grpc.WithInsecure())
if err != nil {
return nil, err
}
return &inWorkspaceServiceClient{
InWorkspaceServiceClient: daemonapi.NewInWorkspaceServiceClient(conn),
conn: conn,
}, nil
}
func init() {
rootCmd.AddCommand(ring0Cmd)
rootCmd.AddCommand(ring1Cmd)
rootCmd.AddCommand(ring2Cmd)
supervisorPath := os.Getenv("GITPOD_WORKSPACEKIT_SUPERVISOR_PATH")
if supervisorPath == "" {
wd, err := os.Executable()
if err == nil {
wd = filepath.Dir(wd)
supervisorPath = filepath.Join(wd, "supervisor")
} else {
supervisorPath = "/.supervisor/supervisor"
}
}
ring1Cmd.Flags().BoolVar(&ring1Opts.MappingEstablished, "mapping-established", false, "true if the UID/GID mapping has already been established")
ring2Cmd.Flags().StringVar(&ring2Opts.SupervisorPath, "supervisor-path", supervisorPath, "path to the supervisor binary (taken from $GITPOD_WORKSPACEKIT_SUPERVISOR_PATH, defaults to '$PWD/supervisor')")
}
func isProcessAlreadyFinished(err error) bool {
return strings.Contains(err.Error(), "os: process already finished")
}