// Copyright (c) 2022 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package controllers

import (
	"context"
	"fmt"
	"strings"
	"sync"
	"time"

	"golang.org/x/xerrors"
	"google.golang.org/grpc"
	"google.golang.org/grpc/codes"
	"google.golang.org/grpc/credentials"
	"google.golang.org/grpc/status"
	"google.golang.org/protobuf/proto"
	corev1 "k8s.io/api/core/v1"
	"k8s.io/apimachinery/pkg/api/errors"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/labels"
	"k8s.io/apimachinery/pkg/runtime"
	"k8s.io/apimachinery/pkg/types"
	"k8s.io/client-go/util/retry"
	"k8s.io/utils/pointer"
	ctrl "sigs.k8s.io/controller-runtime"
	"sigs.k8s.io/controller-runtime/pkg/client"
	"sigs.k8s.io/controller-runtime/pkg/log"

	common_grpc "github.com/gitpod-io/gitpod/common-go/grpc"
	"github.com/gitpod-io/gitpod/common-go/tracing"
	csapi "github.com/gitpod-io/gitpod/content-service/api"
	wsdaemon "github.com/gitpod-io/gitpod/ws-daemon/api"
	"github.com/gitpod-io/gitpod/ws-manager-mk2/grpcpool"
	config "github.com/gitpod-io/gitpod/ws-manager/api/config"
	workspacev1 "github.com/gitpod-io/gitpod/ws-manager/api/crd/v1"
)

const (
	// wsdaemonMaxAttempts is the number of times we'll attempt to work with ws-daemon when a former attempt returned unavailable.
	// We retry for two minutes every 5 seconds (see wsdaemonRetryInterval).
	//
	// Note: this is a variable rather than a constant so that tests can modify this value.
	wsdaemonMaxAttempts = 120 / 5

	// wsdaemonRetryInterval is the time in between attempts to work with ws-daemon.
	//
	// Note: this is a variable rather than a constant so that tests can modify this value.
	wsdaemonRetryInterval = 5 * time.Second

	// wsdaemonDialTimeout is the time we allow for trying to connect to ws-daemon.
	// Note: this is NOT the time we allow for RPC calls to wsdaemon, but just for establishing the connection.
	wsdaemonDialTimeout = 10 * time.Second
)
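
// NewWorkspaceReconciler creates a WorkspaceReconciler for the given configuration and wires up
// the ws-daemon gRPC connection pool the reconciler uses to initialize and finalize workspace content.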
func NewWorkspaceReconciler(c client.Client, scheme *runtime.Scheme, cfg config.Configuration) (*WorkspaceReconciler, error) {
	wsdaemonConnfactory, err := newWsdaemonConnectionFactory(cfg)
	if err != nil {
		return nil, err
	}

	res := &WorkspaceReconciler{
		Client:         c,
		Scheme:         scheme,
		Config:         cfg,
		initializerMap: make(map[string]struct{}),
		finalizerMap:   make(map[string]context.CancelFunc),
		wsdaemonPool:   grpcpool.New(wsdaemonConnfactory, checkWSDaemonEndpoint(cfg.Namespace, c)),
	}
	res.actingManager = res
	return res, nil
}

// WorkspaceReconciler reconciles a Workspace object.
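// It creates workspace pods, mirrors pod state into the Workspace status, and drives content
// initialization and finalization through ws-daemon. Besides the embedded client it keeps two
// mutex-guarded maps: initializerMap tracks workspaces whose content initialization is in flight,
// and finalizerMap stores cancel functions for running content finalization requests.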
type WorkspaceReconciler struct {
	client.Client
	Scheme *runtime.Scheme

	Config      config.Configuration
	OnReconcile func(ctx context.Context, ws *workspacev1.Workspace)

	wsdaemonPool *grpcpool.Pool

	actingManager      actingManager
	initializerMap     map[string]struct{}
	initializerMapLock sync.Mutex
	finalizerMap       map[string]context.CancelFunc
	finalizerMapLock   sync.Mutex
}

// actingManager contains all functions needed by actOnStatus
type actingManager interface {
	// clearInitializerFromMap(podName string)
	initializeWorkspaceContent(ctx context.Context, ws *workspacev1.Workspace) (err error)
	finalizeWorkspaceContent(ctx context.Context, ws *workspacev1.Workspace)
}

//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/status,verbs=get;update;patch
//+kubebuilder:rbac:groups=workspace.gitpod.io,resources=workspaces/finalizers,verbs=update
//+kubebuilder:rbac:groups=core,resources=pod,verbs=get;list;watch;create;update;patch;delete
//+kubebuilder:rbac:groups=core,resources=pod/status,verbs=get

// Reconcile is part of the main kubernetes reconciliation loop which aims to
// move the current state of the cluster closer to the desired state.
// TODO(user): Modify the Reconcile function to compare the state specified by
// the Workspace object against the actual cluster state, and then
// perform operations to make the cluster state reflect the state specified by
// the user.
//
// For more details, check Reconcile and its Result here:
// - https://pkg.go.dev/sigs.k8s.io/controller-runtime@v0.11.0/pkg/reconcile
func (r *WorkspaceReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
	log := log.FromContext(ctx)

	var workspace workspacev1.Workspace
	if err := r.Get(ctx, req.NamespacedName, &workspace); err != nil {
		// TODO(cw): create pdo
		log.Error(err, "unable to fetch workspace")
		// we'll ignore not-found errors, since they can't be fixed by an immediate
		// requeue (we'll need to wait for a new notification), and we can get them
		// on deleted requests.
		return ctrl.Result{}, client.IgnoreNotFound(err)
	}

	log.Info("reconciling workspace", "ws", req.NamespacedName)

	var workspacePods corev1.PodList
	err := r.List(ctx, &workspacePods, client.InNamespace(req.Namespace), client.MatchingFields{wsOwnerKey: req.Name})
	if err != nil {
		log.Error(err, "unable to list workspace pods")
		return ctrl.Result{}, err
	}

	err = updateWorkspaceStatus(ctx, &workspace, workspacePods)
	if err != nil {
		return ctrl.Result{}, err
	}

	result, err := r.actOnStatus(ctx, &workspace, workspacePods)
	if err != nil {
		return result, err
	}

	err = r.Status().Update(ctx, &workspace)
	if err != nil {
		log.Error(err, "unable to update workspace status")
		return ctrl.Result{Requeue: true}, err
	}

	if r.OnReconcile != nil {
		r.OnReconcile(ctx, &workspace)
	}

	return ctrl.Result{}, nil
}
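
// actOnStatus takes action based on the current workspace status and its pod: it creates the
// workspace pod on first reconciliation, deletes the pod of failed or stopped-by-request
// workspaces, kicks off content initialization while the workspace is creating/initializing,
// starts content finalization once the workspace container has terminated, and removes the pod
// finalizer (and eventually the Workspace object) once the workspace has stopped.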
func (r *WorkspaceReconciler) actOnStatus(ctx context.Context, workspace *workspacev1.Workspace, workspacePods corev1.PodList) (ctrl.Result, error) {
	logger := log.FromContext(ctx)

	// if there isn't a workspace pod and we're not currently deleting this workspace,
	// create one.
	if len(workspacePods.Items) == 0 && workspace.Status.PodStarts == 0 {
		sctx, err := newStartWorkspaceContext(ctx, &r.Config, workspace)
		if err != nil {
			logger.Error(err, "unable to create startWorkspace context")
			return ctrl.Result{Requeue: true}, err
		}

		pod, err := r.createWorkspacePod(sctx)
		if err != nil {
			logger.Error(err, "unable to produce workspace pod")
			return ctrl.Result{}, err
		}

		err = r.Create(ctx, pod)
		if errors.IsAlreadyExists(err) {
			// pod exists, we're good
		} else if err != nil {
			logger.Error(err, "unable to create Pod for Workspace", "pod", pod)
			return ctrl.Result{Requeue: true}, err
		} else {
			// TODO(cw): replicate the startup mechanism where pods can fail to be scheduled,
			// need to be deleted and re-created
			workspace.Status.PodStarts++
		}

		return ctrl.Result{}, nil
	}

	// all actions below assume there is a pod
	if len(workspacePods.Items) == 0 {
		return ctrl.Result{}, nil
	}
	pod := &workspacePods.Items[0]

	switch {
	// if there is a pod, and it's failed, delete it
	case workspace.Status.Conditions.Failed != "" && !isPodBeingDeleted(pod):
		err := r.Client.Delete(ctx, pod)
		if errors.IsNotFound(err) {
			// pod is gone - nothing to do here
		} else {
			return ctrl.Result{Requeue: true}, err
		}

	// if the pod was stopped by request, delete it
	case pointer.BoolDeref(workspace.Status.Conditions.StoppedByRequest, false) && !isPodBeingDeleted(pod):
		err := r.Client.Delete(ctx, pod)
		if errors.IsNotFound(err) {
			// pod is gone - nothing to do here
		} else {
			return ctrl.Result{Requeue: true}, err
		}

	// pod is creating and sits on a node - start initializing content
	case (workspace.Status.Phase == workspacev1.WorkspacePhaseCreating ||
		workspace.Status.Phase == workspacev1.WorkspacePhaseInitializing):

		go func() {
			initCtx, cancel := context.WithTimeout(context.Background(), 5*time.Minute)
			initCtx = log.IntoContext(initCtx, logger)
			defer cancel()

			err := r.actingManager.initializeWorkspaceContent(initCtx, workspace)
			if err == nil {
				return
			}

			// workspace initialization failed, which means the workspace as a whole failed
			msg := err.Error()
			logger.Error(err, "unable to initialize workspace")
			err = retry.RetryOnConflict(retry.DefaultBackoff, func() error {
				ctx := context.Background()

				var ws workspacev1.Workspace
				err := r.Client.Get(ctx, types.NamespacedName{Namespace: r.Config.Namespace, Name: workspace.Name}, &ws)
				if err != nil {
					return err
				}

				ws.Status.Conditions.Failed = msg

				return r.Status().Update(ctx, &ws)
			})
			if err != nil {
				logger.Error(err, "was unable to mark workspace as failed")
			}
		}()

	case workspace.Status.Phase == workspacev1.WorkspacePhaseStopping:
		var terminated bool
		for _, c := range pod.Status.ContainerStatuses {
			if c.Name == "workspace" {
				// Note: sometimes containers don't enter `terminated`, but `waiting`. The processes are stopped nonetheless,
				// and we should be running the backup. The only thing that keeps the pod alive is our finalizer.
				terminated = c.State.Running == nil
				break
			}
		}

		if terminated {
			// We start finalizing the workspace content only after the container is gone. This way we ensure there's
			// no process modifying the workspace content as we create the backup.
			go r.actingManager.finalizeWorkspaceContent(ctx, workspace)
		}

	// we've disposed already - try to remove the finalizer and call it a day
	case workspace.Status.Phase == workspacev1.WorkspacePhaseStopped:
		var foundFinalizer bool
		n := 0
		for _, x := range pod.Finalizers {
			if x != gitpodPodFinalizerName {
				pod.Finalizers[n] = x
				n++
			} else {
				foundFinalizer = true
			}
		}
		pod.Finalizers = pod.Finalizers[:n]
		err := r.Client.Update(ctx, pod)
		if err != nil {
			return ctrl.Result{Requeue: true}, err
		}

		if foundFinalizer {
			// requeue to remove the workspace
			return ctrl.Result{RequeueAfter: 10 * time.Second}, nil
		} else {
			err = r.Client.Delete(ctx, workspace)
			if err != nil {
				return ctrl.Result{Requeue: true}, err
			}
		}
	}

	return ctrl.Result{}, nil
}

// finalizeWorkspaceContent talks to the ws-daemon running on the node of the pod and creates a backup of the workspace content.
func (r *WorkspaceReconciler) finalizeWorkspaceContent(ctx context.Context, workspace *workspacev1.Workspace) {
	log := log.FromContext(ctx)

	var disposalStatus *workspacev1.WorkspaceDisposalStatus
	defer func() {
		if disposalStatus == nil {
			return
		}

		err := retry.RetryOnConflict(retry.DefaultBackoff, func() error {
			var ws workspacev1.Workspace
			err := r.Client.Get(ctx, types.NamespacedName{Namespace: r.Config.Namespace, Name: workspace.Name}, &ws)
			if err != nil {
				return err
			}

			ws.Status.Disposal = disposalStatus
			return r.Client.Update(ctx, &ws)
		})
		if err != nil {
			log.Error(err, "was unable to update the workspace's disposal state - this will break someone's experience")
		}
	}()

	doBackup := workspace.Status.Conditions.EverReady && !workspace.Status.Headless
	doBackupLogs := workspace.Spec.Type == workspacev1.WorkspaceTypePrebuild
	doSnapshot := workspace.Spec.Type == workspacev1.WorkspaceTypePrebuild
	doFinalize := func() (worked bool, gitStatus *workspacev1.GitStatus, err error) {
		r.finalizerMapLock.Lock()
		_, alreadyFinalizing := r.finalizerMap[workspace.Name]
		if alreadyFinalizing {
			r.finalizerMapLock.Unlock()
			return false, nil, nil
		}

		var hostIP string
		if workspace.Status.Runtime != nil {
			hostIP = workspace.Status.Runtime.HostIP
		}

		// Maybe the workspace never made it to a phase where we actually initialized a workspace.
		// Once we've had a nodeName we've spoken to ws-daemon, so it's safe to assume that without
		// a nodeName we don't need to dispose of the workspace.
		// Obviously that only holds if we do not require a backup. If we do require one, we want to
		// fail as loudly as we can in this case.
		if !doBackup && !doSnapshot && hostIP == "" {
			// we don't need a backup and have never spoken to ws-daemon: we're good here.
			r.finalizerMapLock.Unlock()
			return true, nil, nil
		}

		// we're not yet finalizing - start the process
		snc, err := r.connectToWorkspaceDaemon(ctx, workspace)
		if err != nil {
			r.finalizerMapLock.Unlock()
			return true, nil, err
		}

		ctx, cancelReq := context.WithTimeout(ctx, time.Duration(r.Config.Timeouts.ContentFinalization))
		r.finalizerMap[workspace.Name] = cancelReq
		r.finalizerMapLock.Unlock()
		defer func() {
			// we're done disposing - remove from the finalizerMap
			r.finalizerMapLock.Lock()
			delete(r.finalizerMap, workspace.Name)
			r.finalizerMapLock.Unlock()
		}()

		if doSnapshot {
			// if this is a prebuild, take a snapshot and mark the workspace
			var res *wsdaemon.TakeSnapshotResponse
			res, err = snc.TakeSnapshot(ctx, &wsdaemon.TakeSnapshotRequest{Id: workspace.Name})
			if err != nil {
				log.Error(err, "cannot take snapshot")
				err = xerrors.Errorf("cannot take snapshot: %v", err)
			}

			if res != nil {
				err = r.modifyWorkspaceStatus(ctx, workspace.Name, func(status *workspacev1.WorkspaceStatus) error {
					results := status.Results
					if results == nil {
						results = &workspacev1.WorkspaceResults{}
					}
					results.Snapshot = res.Url
					status.Results = results
					return nil
				})
				if err != nil {
					log.Error(err, "cannot mark headless workspace with snapshot - that's one prebuild lost")
					err = xerrors.Errorf("cannot remember snapshot: %v", err)
				}
			}
		}

		// DisposeWorkspace will "degenerate" to a simple wait if the finalization/disposal process is already running.
		// This is unlike the initialization process where we wait for things to finish in a later phase.
		resp, err := snc.DisposeWorkspace(ctx, &wsdaemon.DisposeWorkspaceRequest{
			Id:         workspace.Name,
			Backup:     doBackup,
			BackupLogs: doBackupLogs,
		})
		if resp != nil {
			gitStatus = gitStatusfromContentServiceAPI(resp.GitStatus)
		}
		return true, gitStatus, err
	}

	var (
		dataloss    bool
		backupError error
		gitStatus   *workspacev1.GitStatus
	)
	for i := 0; i < wsdaemonMaxAttempts; i++ {
		didSomething, gs, err := doFinalize()
		if !didSomething {
			// someone else is managing the finalization process ... we don't have to bother
			return
		}

		// by default we assume the worst-case scenario. If things aren't quite as bad, we'll tune it down below.
		dataloss = true
		backupError = err
		gitStatus = gs

		// At this point one of three things may have happened:
		// 1. the context deadline was exceeded, e.g. due to misconfiguration (not enough time to upload) or network issues. We'll try again.
		// 2. the service was unavailable, in which case we'll try again.
		// 3. none of the above, in which case we'll give up
		st, isGRPCError := status.FromError(err)
		if !isGRPCError {
			break
		}

		if (err != nil && strings.Contains(err.Error(), context.DeadlineExceeded.Error())) ||
			st.Code() == codes.Unavailable ||
			st.Code() == codes.Canceled {
			// the service is currently unavailable or we did not finish in time - let's wait some time and try again
			time.Sleep(wsdaemonRetryInterval)
			continue
		}

		// the service was available, we've tried to do the work and failed. Tell the world about it.
		if (doBackup || doSnapshot) && isGRPCError {
			switch st.Code() {
			case codes.DataLoss:
				// ws-daemon told us that it's lost data
				dataloss = true
			case codes.FailedPrecondition:
				// the workspace content was not in the state we thought it was
				dataloss = true
			}
		}
		break
	}

	disposalStatus = &workspacev1.WorkspaceDisposalStatus{
		BackupComplete: true,
		GitStatus:      gitStatus,
	}
	if backupError != nil {
		if dataloss {
			disposalStatus.BackupFailure = backupError.Error()
		} else {
			// internal errors make no difference to the user experience. The backup still worked, we just messed up some
			// state management or cleanup. No need to worry the user.
			log.Error(backupError, "internal error while disposing workspace content")
		}
	}
	err := r.modifyWorkspaceStatus(ctx, workspace.Name, func(status *workspacev1.WorkspaceStatus) error {
		status.Disposal = disposalStatus
		log.V(1).Info("setting disposal status", "instanceID", workspace.Name, "status", *disposalStatus)
		return nil
	})
	if err != nil {
		log.Error(err, "cannot update workspace disposal status")
	}
}
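
// gitStatusfromContentServiceAPI maps the content-service git status onto the workspace CRD's GitStatus type.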
func gitStatusfromContentServiceAPI(s *csapi.GitStatus) *workspacev1.GitStatus {
	return &workspacev1.GitStatus{
		Branch:               s.Branch,
		LatestCommit:         s.LatestCommit,
		UncommitedFiles:      s.UncommitedFiles,
		TotalUncommitedFiles: s.TotalUncommitedFiles,
		UntrackedFiles:       s.UntrackedFiles,
		TotalUntrackedFiles:  s.TotalUntrackedFiles,
		UnpushedCommits:      s.UnpushedCommits,
		TotalUnpushedCommits: s.TotalUnpushedCommits,
	}
}

// connectToWorkspaceDaemon establishes a connection to the ws-daemon instance running on the node of the pod/workspace.
func (r *WorkspaceReconciler) connectToWorkspaceDaemon(ctx context.Context, workspace *workspacev1.Workspace) (wcsClient wsdaemon.WorkspaceContentServiceClient, err error) {
	log := log.FromContext(ctx)

	var nodeName string
	if workspace.Status.Runtime != nil {
		nodeName = workspace.Status.Runtime.NodeName
	}
	if nodeName == "" {
		return nil, xerrors.Errorf("no nodeName found")
	}

	var podList corev1.PodList
	err = r.Client.List(ctx, &podList,
		&client.ListOptions{
			Namespace: r.Config.Namespace,
			LabelSelector: labels.SelectorFromSet(labels.Set{
				"component": "ws-daemon",
				"app":       "gitpod",
			}),
		},
	)
	if err != nil {
		return nil, err
	}

	// find the ws-daemon on this node
	var hostIP string
	for _, pod := range podList.Items {
		if pod.Spec.NodeName == nodeName {
			hostIP = pod.Status.PodIP
			break
		}
	}

	if hostIP == "" {
		return nil, xerrors.Errorf("no running ws-daemon pod found")
	}
	conn, err := r.wsdaemonPool.Get(hostIP)
	if err != nil {
		return nil, err
	}

	log.V(1).Info("connecting to ws-daemon", "workspace", workspace.Name, "hostName", nodeName, "wsDaemonHostIP", hostIP)
	return wsdaemon.NewWorkspaceContentServiceClient(conn), nil
}

// modifyWorkspaceStatus fetches the workspace with the given name, applies mod to its status
// and persists the result, retrying on update conflicts. Any error returned by mod aborts the
// update and is returned to the caller.
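//
// Example usage, mirroring the call in finalizeWorkspaceContent:
//
//	err := r.modifyWorkspaceStatus(ctx, workspace.Name, func(status *workspacev1.WorkspaceStatus) error {
//		status.Disposal = disposalStatus
//		return nil
//	})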
func (r *WorkspaceReconciler) modifyWorkspaceStatus(ctx context.Context, id string, mod func(ws *workspacev1.WorkspaceStatus) error) error {
	return retry.RetryOnConflict(retry.DefaultBackoff, func() error {
		var ws workspacev1.Workspace
		err := r.Client.Get(ctx, types.NamespacedName{Namespace: r.Config.Namespace, Name: id}, &ws)
		if err != nil {
			return err
		}

		err = mod(&ws.Status)
		if err != nil {
			return err
		}

		return r.Client.Status().Update(ctx, &ws)
	})
}

// initializeWorkspaceContent talks to the ws-daemon on the node of the pod and initializes the workspace content.
// If we're already initializing the workspace, this function returns immediately. If we were not initializing
// prior to this call, this function returns once initialization is complete.
func (r *WorkspaceReconciler) initializeWorkspaceContent(ctx context.Context, ws *workspacev1.Workspace) (err error) {
	if ws.Spec.Ownership.Owner == "" {
		return xerrors.Errorf("workspace instance %s has no owner", ws.Name)
	}

	var (
		initializer     csapi.WorkspaceInitializer
		snc             wsdaemon.WorkspaceContentServiceClient
		contentManifest []byte
	)
	// The function below delineates the initializer lock. It's just there so that we can
	// defer the unlock call, thus making sure we actually call it.
	err = func() error {
		r.initializerMapLock.Lock()
		defer r.initializerMapLock.Unlock()

		_, alreadyInitializing := r.initializerMap[ws.Name]
		if alreadyInitializing {
			return nil
		}

		// There is no need to emit this span if the operation is a noop.
		span, ctx := tracing.FromContext(ctx, "initializeWorkspace")
		defer tracing.FinishSpan(span, &err)

		err = proto.Unmarshal(ws.Spec.Initializer, &initializer)
		if err != nil {
			return xerrors.Errorf("cannot unmarshal init config: %w", err)
		}

		// connect to the appropriate ws-daemon
		snc, err = r.connectToWorkspaceDaemon(ctx, ws)
		if err != nil {
			return err
		}

		// mark that we're already initialising this workspace
		r.initializerMap[ws.Name] = struct{}{}

		return nil
	}()
	if err != nil {
		return xerrors.Errorf("cannot initialize workspace: %w", err)
	}
	if err == nil && snc == nil {
		// we are already initialising
		return nil
	}
	log.FromContext(ctx).Info("initialising workspace", "workspace", ws.Name)
	_, err = snc.InitWorkspace(ctx, &wsdaemon.InitWorkspaceRequest{
		Id: ws.Name,
		Metadata: &wsdaemon.WorkspaceMetadata{
			Owner:  ws.Spec.Ownership.Owner,
			MetaId: ws.Spec.Ownership.WorkspaceID,
		},
		Initializer:           &initializer,
		FullWorkspaceBackup:   false,
		ContentManifest:       contentManifest,
		RemoteStorageDisabled: ws.Spec.Type == workspacev1.WorkspaceTypeImageBuild,
	})
	if st, ok := status.FromError(err); ok && st.Code() == codes.AlreadyExists {
		// we're already initializing, things are good - we'll wait for it later
		err = nil
	}
	if err != nil {
		return xerrors.Errorf("cannot initialize workspace: %w", err)
	}

	return nil
}
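
// wsOwnerKey is the field-index key registered in SetupWithManager; Reconcile uses it to list
// the pods owned by a workspace via client.MatchingFields{wsOwnerKey: req.Name}. apiGVStr is
// used by the index function to check that a pod's controller is indeed a Workspace.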
var (
	wsOwnerKey = ".metadata.controller"
	apiGVStr   = workspacev1.GroupVersion.String()
)

// SetupWithManager sets up the controller with the Manager.
func (r *WorkspaceReconciler) SetupWithManager(mgr ctrl.Manager) error {
	idx := func(rawObj client.Object) []string {
		// grab the pod object, extract the owner...
		pod := rawObj.(*corev1.Pod)
		owner := metav1.GetControllerOf(pod)
		if owner == nil {
			return nil
		}
		// ...make sure it's a workspace...
		if owner.APIVersion != apiGVStr || owner.Kind != "Workspace" {
			return nil
		}

		// ...and if so, return it
		return []string{owner.Name}
	}
	err := mgr.GetFieldIndexer().IndexField(context.Background(), &corev1.Pod{}, wsOwnerKey, idx)
	if err != nil {
		return err
	}

	return ctrl.NewControllerManagedBy(mgr).
		For(&workspacev1.Workspace{}).
		Owns(&corev1.Pod{}).
		Complete(r)
}

// newWsdaemonConnectionFactory creates a new wsdaemon connection factory based on the wsmanager configuration
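// (TLS towards ws-daemon if certificates are configured, plaintext otherwise). The returned
// factory is handed to grpcpool.New in NewWorkspaceReconciler, which serves the connections
// obtained via r.wsdaemonPool.Get(hostIP) in connectToWorkspaceDaemon.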
func newWsdaemonConnectionFactory(managerConfig config.Configuration) (grpcpool.Factory, error) {
	cfg := managerConfig.WorkspaceDaemon
	// TODO(cw): add client-side gRPC metrics
	grpcOpts := common_grpc.DefaultClientOptions()
	if cfg.TLS.Authority != "" || cfg.TLS.Certificate != "" && cfg.TLS.PrivateKey != "" {
		tlsConfig, err := common_grpc.ClientAuthTLSConfig(
			cfg.TLS.Authority, cfg.TLS.Certificate, cfg.TLS.PrivateKey,
			common_grpc.WithSetRootCAs(true),
			common_grpc.WithServerName("wsdaemon"),
		)
		if err != nil {
			return nil, xerrors.Errorf("cannot load ws-daemon certs: %w", err)
		}

		grpcOpts = append(grpcOpts, grpc.WithTransportCredentials(credentials.NewTLS(tlsConfig)))
	} else {
		grpcOpts = append(grpcOpts, grpc.WithInsecure())
	}
	port := cfg.Port

	return func(host string) (*grpc.ClientConn, error) {
		var (
			addr           = fmt.Sprintf("%s:%d", host, port)
			conctx, cancel = context.WithTimeout(context.Background(), wsdaemonDialTimeout)
		)
		// Canceling conctx becomes a no-op once the connection is established.
		// Because we use WithBlock in the opts, DialContext will only return once the connection is established.
		// Hence upon leaving this function we can safely cancel the conctx.
		defer cancel()

		conn, err := grpc.DialContext(conctx, addr, grpcOpts...)
		if err != nil {
			return nil, xerrors.Errorf("cannot connect to workspace daemon: %w", err)
		}
		return conn, nil
	}, nil
}
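
// checkWSDaemonEndpoint returns the address check handed to grpcpool.New in NewWorkspaceReconciler:
// it reports whether a ws-daemon pod with the given pod IP currently exists in the given namespace.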
func checkWSDaemonEndpoint(namespace string, clientset client.Client) func(string) bool {
	return func(address string) bool {
		var podList corev1.PodList
		err := clientset.List(context.Background(), &podList,
			&client.ListOptions{
				Namespace: namespace,
				LabelSelector: labels.SelectorFromSet(labels.Set{
					"component": "ws-daemon",
					"app":       "gitpod",
				}),
			},
		)
		if err != nil {
			// log.Error("cannot list ws-daemon pods")
			return false
		}

		for _, pod := range podList.Items {
			if pod.Status.PodIP == address {
				return true
			}
		}

		return false
	}
}