// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License-AGPL.txt in the project root for license information.

package manager

import (
	"bytes"
	"encoding/json"
	"fmt"
	"strings"
	"time"

	wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
	"github.com/gitpod-io/gitpod/common-go/log"
	"github.com/gitpod-io/gitpod/common-go/util"
	regapi "github.com/gitpod-io/gitpod/registry-facade/api"
	"github.com/gitpod-io/gitpod/ws-manager/api"
	"github.com/golang/protobuf/ptypes"

	"github.com/sirupsen/logrus"
	"golang.org/x/xerrors"
	corev1 "k8s.io/api/core/v1"
	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
	"k8s.io/apimachinery/pkg/runtime"
)

const (
	// containerKilledExitCode is the exit code Kubernetes uses for a container which was killed by the system.
	// We expect such containers to be restarted by Kubernetes if they're supposed to be running.
	// We never deliberately terminate a container like this.
	containerKilledExitCode = 137

	// containerUnknownExitCode is the exit code containerd uses if it cannot determine the cause/exit status of
	// a stopped container.
	containerUnknownExitCode = 255
)

// scheme is the default instance of runtime.Scheme to which types in the Kubernetes API are already registered.
// Inspired by https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/scheme/scheme.go
var scheme = runtime.NewScheme()

// register the core schema
func init() {
	err := corev1.AddToScheme(scheme)
	if err != nil {
		log.WithError(err).Fatal("cannot register Kubernetes core schema - this should never happen")
	}
}

// workspaceObjects contains all Kubernetes objects required to compute the status of a workspace
type workspaceObjects struct {
	Pod          *corev1.Pod     `json:"pod"`
	TheiaService *corev1.Service `json:"theiaService,omitempty"`
	PortsService *corev1.Service `json:"portsService,omitempty"`
	Events       []corev1.Event  `json:"events,omitempty"`

	// PLIS is the Pod Lifecycle Independent State which we use to store state if there's no more appropriate place.
	// This is really a last resort and should only be used if there really is no other means of storing the state.
	PLIS *corev1.ConfigMap `json:"plis,omitempty"`
}

// GetOWI produces the owner, workspace, instance triple that we use for tracing and logging
func (wso *workspaceObjects) GetOWI() logrus.Fields {
	if wso.Pod != nil {
		return wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)
	}
	if wso.PLIS != nil {
		return wsk8s.GetOWIFromObject(&wso.PLIS.ObjectMeta)
	}
	return logrus.Fields{}
}

// IsWorkspaceHeadless returns true if the workspace described by these objects is headless
func (wso *workspaceObjects) IsWorkspaceHeadless() bool {
	if wso.Pod != nil {
		val, ok := wso.Pod.ObjectMeta.Labels[headlessLabel]
		return ok && val == "true"
	}
	if wso.PLIS != nil {
		val, ok := wso.PLIS.ObjectMeta.Labels[headlessLabel]
		return ok && val == "true"
	}
	return false
}

func (wso *workspaceObjects) WorkspaceType() (api.WorkspaceType, error) {
	var meta *metav1.ObjectMeta
	if wso.Pod != nil {
		meta = &wso.Pod.ObjectMeta
	} else if wso.PLIS != nil {
		meta = &wso.PLIS.ObjectMeta
	} else {
		// we don't know anything about this pod - assume it's a regular pod
		return api.WorkspaceType_REGULAR, xerrors.Errorf("cannot determine pod type")
	}

	lbl, ok := meta.Labels[wsk8s.TypeLabel]
	if !ok {
		// LEGACY
		// this is a legacy pod without an explicit workspace type. If it's headless it must be a prebuild, otherwise it's a regular workspace.
		var tpe api.WorkspaceType
		if wso.IsWorkspaceHeadless() {
			tpe = api.WorkspaceType_PREBUILD
		} else {
			tpe = api.WorkspaceType_REGULAR
		}
		log.WithFields(wsk8s.GetOWIFromObject(meta)).WithField("workspaceType", tpe).Info("determining type of legacy pod")
		return tpe, nil
	}

	val, ok := api.WorkspaceType_value[strings.ToUpper(lbl)]
	if !ok {
		// pod has an invalid type label - we assume it's a regular pod
		return api.WorkspaceType_REGULAR, xerrors.Errorf("pod has invalid type label: %s", lbl)
	}

	return api.WorkspaceType(val), nil
}

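// For illustration (a sketch, not part of the control flow above): the type label is matched
// case-insensitively against the protobuf enum names, so a hypothetical pod labeled with
// wsk8s.TypeLabel = "prebuild" resolves as
//
//	val, ok := api.WorkspaceType_value[strings.ToUpper("prebuild")]
//	// val == int32(api.WorkspaceType_PREBUILD), ok == true
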
// WorkspaceID returns the ID of the workspace
func (wso *workspaceObjects) WorkspaceID() (id string, ok bool) {
	if wso.Pod != nil {
		if r, ok := wso.Pod.Annotations[workspaceIDAnnotation]; ok {
			return r, true
		}
	}
	if wso.PLIS != nil {
		if r, ok := wso.PLIS.Annotations[workspaceIDAnnotation]; ok {
			return r, true
		}
	}

	return "", false
}

func (wso *workspaceObjects) WasEverReady() (res bool) {
	const deprecatedWorkspaceReadyAnnotation = "gitpod/ready"

	check := func(a map[string]string) bool {
		// we may still have some legacy pods running that used the former deprecatedWorkspaceReadyAnnotation.
		// If we see that flag we have to give that one precedence.
		if _, ok := a[deprecatedWorkspaceReadyAnnotation]; ok {
			return true
		}

		_, neverReady := a[workspaceNeverReadyAnnotation]
		return !neverReady
	}

	if wso.Pod != nil {
		return check(wso.Pod.Annotations)
	}
	if wso.PLIS != nil {
		return check(wso.PLIS.Annotations)
	}

	// We assume the pod was ready by default, even if we have nothing to show for it.
	// The real world has shown that this produces the more favorable failure modes.
	return true
}

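// For illustration, WasEverReady resolves the annotation combinations roughly as follows
// (hypothetical annotation maps; workspaceNeverReadyAnnotation is a key defined elsewhere in this package):
//
//	{}                                      -> true  (assumed ready by default)
//	{workspaceNeverReadyAnnotation: "true"} -> false (the workspace was never marked ready)
//	{"gitpod/ready": "true"}                -> true  (the deprecated legacy flag takes precedence)
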
// HostIP returns the IP of the node this workspace is/was deployed to. If this workspace has never been deployed anywhere, HostIP returns an empty string.
func (wso *workspaceObjects) HostIP() string {
	if wso.Pod != nil {
		return wso.Pod.Status.HostIP
	}
	if wso.PLIS != nil {
		plis, _ := unmarshalPodLifecycleIndependentState(wso.PLIS)
		if plis != nil {
			return plis.HostIP
		}
	}
	return ""
}

func (m *Manager) getWorkspaceObjects(pod *corev1.Pod) (*workspaceObjects, error) {
	wso := &workspaceObjects{Pod: pod}
	err := m.completeWorkspaceObjects(wso)
	if err != nil {
		return nil, xerrors.Errorf("getWorkspaceObjects: %w", err)
	}
	return wso, nil
}

// completeWorkspaceObjects finds the remaining Kubernetes objects based on the pod description
// or pod lifecycle independent state.
func (m *Manager) completeWorkspaceObjects(wso *workspaceObjects) error {
	if wso.Pod == nil && wso.PLIS == nil {
		return xerrors.Errorf("completeWorkspaceObjects: need either pod or lifecycle independent state")
	}

	// find the pod if we're working on the PLIS alone so far
	if wso.Pod == nil {
		workspaceID, ok := wso.PLIS.ObjectMeta.Annotations[workspaceIDAnnotation]
		if !ok {
			return xerrors.Errorf("cannot find %s annotation on %s", workspaceIDAnnotation, wso.PLIS.Name)
		}

		pod, err := m.findWorkspacePod(workspaceID)
		if err == nil {
			wso.Pod = pod
		}

		if err != nil && !isKubernetesObjNotFoundError(err) {
			return xerrors.Errorf("completeWorkspaceObjects: %w", err)
		}
	}

	// find our service prefix to see if the services still exist
	servicePrefix := ""
	if wso.Pod != nil {
		servicePrefix = wso.Pod.Annotations[servicePrefixAnnotation]
	}
	if servicePrefix == "" && wso.PLIS != nil {
		servicePrefix = wso.PLIS.Annotations[servicePrefixAnnotation]
	}
	if servicePrefix == "" {
		return xerrors.Errorf("completeWorkspaceObjects: no service prefix found")
	}
	serviceClient := m.Clientset.CoreV1().Services(m.Config.Namespace)
	if wso.TheiaService == nil {
		service, err := serviceClient.Get(getTheiaServiceName(servicePrefix), metav1.GetOptions{})
		if err == nil {
			wso.TheiaService = service
		}

		if err != nil && !isKubernetesObjNotFoundError(err) {
			return xerrors.Errorf("completeWorkspaceObjects: %w", err)
		}
	}
	if wso.PortsService == nil {
		service, err := serviceClient.Get(getPortsServiceName(servicePrefix), metav1.GetOptions{})
		if err == nil {
			wso.PortsService = service
		}

		if err != nil && !isKubernetesObjNotFoundError(err) {
			return xerrors.Errorf("completeWorkspaceObjects: %w", err)
		}
	}

	// find pod events - this only makes sense if we still have a pod
	if wso.Pod != nil && wso.Events == nil {
		events, err := m.Clientset.CoreV1().Events(m.Config.Namespace).Search(scheme, wso.Pod)
		if err != nil {
			return xerrors.Errorf("completeWorkspaceObjects: %w", err)
		}

		wso.Events = make([]corev1.Event, len(events.Items))
		copy(wso.Events, events.Items)
	}

	// if we don't have a PLIS but a pod, try and find the PLIS
	if wso.PLIS == nil {
		workspaceID, ok := wso.Pod.Annotations[workspaceIDAnnotation]
		if !ok {
			return fmt.Errorf("cannot act on pod %s: has no %s annotation", wso.Pod.Name, workspaceIDAnnotation)
		}

		plis, err := m.Clientset.CoreV1().ConfigMaps(m.Config.Namespace).Get(getPodLifecycleIndependentCfgMapName(workspaceID), metav1.GetOptions{})
		if err != nil && !isKubernetesObjNotFoundError(err) {
			return xerrors.Errorf("completeWorkspaceObjects: %w", err)
		}

		wso.PLIS = plis
	}

	return nil
}

func getPodLifecycleIndependentCfgMapName(workspaceID string) string {
	return fmt.Sprintf("plis-%s", workspaceID)
}

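// For example, a workspace with the (hypothetical) ID "foobar" keeps its pod lifecycle
// independent state in a ConfigMap named "plis-foobar".
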
func (m *Manager) getWorkspaceStatus(wso workspaceObjects) (*api.WorkspaceStatus, error) {
	id, ok := wso.WorkspaceID()
	if !ok {
		return nil, xerrors.Errorf("cannot get %s annotation from %s", workspaceIDAnnotation, wso.Pod.Name)
	}

	var status *api.WorkspaceStatus
	if wso.Pod == nil {
		// Status computation depends heavily on the workspace pod, as that pod contains the better part of our
		// configuration and status. It is possible that we do not have a pod yet/anymore and have to rely on the
		// pod lifecycle independent state to come up with our status.
		//
		// In that case we fall back to some reduced status computation which uses the PLIS only.

		var err error
		status, err = m.getWorkspaceStatusFromPLIS(wso)
		if err != nil {
			return nil, err
		}
	} else {
		// we have a workspace pod - use that to compute the status from scratch (as compared to pulling it out of the PLIS alone)
		workspaceContainer := getContainer(wso.Pod, "workspace")
		if workspaceContainer == nil {
			return nil, xerrors.Errorf("workspace pod for %s is degenerate - does not have workspace container", id)
		}

		wsurl, ok := wso.Pod.Annotations[workspaceURLAnnotation]
		if !ok {
			return nil, xerrors.Errorf("pod %s has no %s annotation", wso.Pod.Name, workspaceURLAnnotation)
		}

		tpe, err := wso.WorkspaceType()
		if err != nil {
			return nil, err
		}

		var timeout string
		if t := m.Config.Timeouts.RegularWorkspace; t > 0 {
			timeout = t.String()
		}
		if v, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
			timeout = v
		}

		var (
			wsImage  = workspaceContainer.Image
			ideImage string
		)
		if ispec, ok := wso.Pod.Annotations[workspaceImageSpecAnnotation]; ok {
			spec, err := regapi.ImageSpecFromBase64(ispec)
			if err != nil {
				return nil, xerrors.Errorf("invalid image spec: %w", err)
			}
			wsImage = spec.BaseRef
			ideImage = spec.IdeRef
		}

		ownerToken, ok := wso.Pod.Annotations[ownerTokenAnnotation]
		if !ok {
			log.WithFields(wso.GetOWI()).Warn("pod has no owner token. Is this a legacy pod?")
		}
		admission := api.AdmissionLevel_ADMIT_OWNER_ONLY
		if av, ok := api.AdmissionLevel_value[strings.ToUpper(wso.Pod.Annotations[workspaceAdmissionAnnotation])]; ok {
			admission = api.AdmissionLevel(av)
		}

		status = &api.WorkspaceStatus{
			Id:       id,
			Metadata: getWorkspaceMetadata(wso.Pod),
			Spec: &api.WorkspaceSpec{
				Headless:       wso.IsWorkspaceHeadless(),
				WorkspaceImage: wsImage,
				IdeImage:       ideImage,
				Url:            wsurl,
				Type:           tpe,
				Timeout:        timeout,
			},
			Conditions: &api.WorkspaceConditions{
				Snapshot: wso.Pod.Annotations[workspaceSnapshotAnnotation],
			},
			Runtime: &api.WorkspaceRuntimeInfo{
				NodeName: wso.Pod.Spec.NodeName,
			},
			Auth: &api.WorkspaceAuthentication{
				Admission:  admission,
				OwnerToken: ownerToken,
			},
		}

		// pod first, PLIS later
		err = m.extractStatusFromPod(status, wso)
		if err != nil {
			return nil, xerrors.Errorf("cannot get workspace status: %w", err)
		}

		if wso.PLIS != nil {
			plis, err := unmarshalPodLifecycleIndependentState(wso.PLIS)
			if err != nil {
				return nil, xerrors.Errorf("cannot get workspace status: %w", err)
			}

			err = extractStatusFromPLIS(status, &wso, plis)
			if err != nil {
				return nil, xerrors.Errorf("cannot get workspace status: %w", err)
			}
		}
	}

	exposedPorts := []*api.PortSpec{}
	if wso.PortsService != nil {
		service := wso.PortsService

		for _, p := range service.Spec.Ports {
			port := &api.PortSpec{
				Port:       uint32(p.Port),
				Target:     uint32(p.TargetPort.IntValue()),
				Visibility: portNameToVisibility(p.Name),
				Url:        service.Annotations[fmt.Sprintf("gitpod/port-url-%d", p.Port)],
			}

			// enforce the canonical form where the target defaults to the port,
			// i.e. a service port 8080 targeting 8080 serializes as {Port: 8080, Target: 0}
			if port.Port == port.Target {
				port.Target = 0
			}

			exposedPorts = append(exposedPorts, port)
		}
	}
	status.Spec.ExposedPorts = exposedPorts

	var serviceExists api.WorkspaceConditionBool
	if wso.TheiaService != nil || wso.PortsService != nil {
		serviceExists = api.WorkspaceConditionBool_TRUE
	} else {
		serviceExists = api.WorkspaceConditionBool_FALSE
	}
	status.Conditions.ServiceExists = serviceExists

	if wso.Pod == nil {
		status.Conditions.Deployed = api.WorkspaceConditionBool_FALSE
	} else {
		status.Conditions.Deployed = api.WorkspaceConditionBool_TRUE
	}

	return status, nil
}

func getContainer(pod *corev1.Pod, name string) *corev1.Container {
	for i := range pod.Spec.Containers {
		c := &pod.Spec.Containers[i]
		if c.Name == name {
			return c
		}
	}
	return nil
}

// getWorkspaceMetadata extracts a workspace's metadata from pod labels
func getWorkspaceMetadata(pod *corev1.Pod) *api.WorkspaceMetadata {
	started, _ := ptypes.TimestampProto(pod.CreationTimestamp.Time)
	return &api.WorkspaceMetadata{
		Owner:     pod.ObjectMeta.Labels[wsk8s.OwnerLabel],
		MetaId:    pod.ObjectMeta.Labels[wsk8s.MetaIDLabel],
		StartedAt: started,
	}
}

func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspaceObjects) error {
	pod := wso.Pod

	// check failure states, i.e. determine the value of result.Failed
	failure, phase := extractFailure(wso)
	result.Conditions.Failed = failure
	if phase != nil {
		result.Phase = *phase
		return nil
	}
	if reason, timedout := pod.Annotations[workspaceTimedOutAnnotation]; timedout {
		if reason == "" {
			reason = "workspace timed out for an unknown reason"
		}
		result.Conditions.Timeout = reason
	}

	if isPodBeingDeleted(pod) {
		result.Phase = api.WorkspacePhase_STOPPING

		_, podFailedBeforeBeingStopped := pod.Annotations[workspaceFailedBeforeStoppingAnnotation]
		if !podFailedBeforeBeingStopped {
			// While the pod is being deleted we do not care or want to know about any failure state.
			// If the pod got stopped because it failed we will have sent out a Stopping status with a "failure" already.
			result.Conditions.Failed = ""
		}

		return nil
	}

	status := pod.Status
	if status.Phase == corev1.PodPending {
		// check if any container is still pulling images
		for _, cs := range status.ContainerStatuses {
			if cs.State.Waiting != nil {
				if cs.State.Waiting.Reason != "ContainerCreating" {
					continue
				}

				if wso.WasEverReady() {
					// Workspace was ready at some point but has become unready. This should never happen due to the
					// very conservative readiness probe. If it does happen however, we don't want to fall back to a
					// creating phase.
					log.WithField("pod", pod).Warn("once ready pod became unready - this should not happen")
					continue
				}

				result.Phase = api.WorkspacePhase_CREATING
				result.Conditions.PullingImages = api.WorkspaceConditionBool_TRUE
				result.Message = "containers are being created"
				return nil
			}
		}

		result.Phase = api.WorkspacePhase_PENDING
		result.Message = "pod is pending"
		return nil
	} else if status.Phase == corev1.PodRunning {
		if firstUserActivity, ok := wso.Pod.Annotations[firstUserActivityAnnotation]; ok {
			t, err := time.Parse(time.RFC3339Nano, firstUserActivity)
			if err != nil {
				return xerrors.Errorf("cannot parse firstUserActivity: %w", err)
			}
			pt, err := ptypes.TimestampProto(t)
			if err != nil {
				return xerrors.Errorf("cannot convert firstUserActivity: %w", err)
			}
			result.Conditions.FirstUserActivity = pt
		}

		for _, cs := range status.ContainerStatuses {
			// containers that were terminated are not ready, but may have been
			if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerUnknownExitCode {
				// the container was stopped for an unknown reason.
				// this means that the workspace is currently interrupted.
				result.Phase = api.WorkspacePhase_INTERRUPTED

				// was it caused by a network outage?
				if hasNetworkNotReadyEvent(wso) {
					// this might be a false positive in case the workspace recovers but has an exit code 255 for
					// another reason afterwards. As of now we're lacking the data to handle it better.
					result.Conditions.NetworkNotReady = api.WorkspaceConditionBool_TRUE
					result.Message = "container network not ready - workspace should recover shortly"
				} else {
					result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
				}

				return nil
			}

			if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerKilledExitCode {
				// we have a container which was killed with exit code 137 recently and is (hopefully) restarting at the moment.
				// this means that the workspace is currently interrupted.
				result.Phase = api.WorkspacePhase_INTERRUPTED
				result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
				return nil
			}

			if !cs.Ready && cs.RestartCount > 0 {
				// this container was running before, but is currently recovering from an interruption
				result.Phase = api.WorkspacePhase_INTERRUPTED
				result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace is recovering", cs.Name)
				return nil
			}

			_, neverWereReady := pod.Annotations[workspaceNeverReadyAnnotation]
			if neverWereReady && !cs.Ready {
				// the container isn't ready yet (never has been), thus we're still in the creating phase.
				result.Phase = api.WorkspacePhase_CREATING
				result.Message = "containers are starting"
				result.Conditions.PullingImages = api.WorkspaceConditionBool_FALSE
				return nil
			}
		}

		if wso.IsWorkspaceHeadless() {
			// headless workspaces don't expose a public service and thus cannot be asked about their status.
			// once Kubernetes reports the workspace running, so do we.
			result.Phase = api.WorkspacePhase_RUNNING
			return nil
		}

		if _, neverReady := pod.Annotations[workspaceNeverReadyAnnotation]; !neverReady {
			// the workspace has been marked ready by a workspace-ready probe of the monitor
			result.Phase = api.WorkspacePhase_RUNNING
			return nil
		}

		// the workspace has not yet been marked ready by one of the monitor's probes. It must be initializing then.
		result.Phase = api.WorkspacePhase_INITIALIZING
		result.Message = "workspace initializer is running"
		return nil
	} else if status.Phase == corev1.PodUnknown {
		result.Phase = api.WorkspacePhase_UNKNOWN
		result.Message = "Kubernetes reports workspace phase as unknown"
		return nil
	}

	// If we've extracted a failure reason earlier, but no explicit phase at the time, as well as in this
	// function, we resort to unknown.
	// This is different to being unable to determine the workspace phase - the phase is unknown due to an unknown failure (by definition).
	if failure != "" {
		result.Phase = api.WorkspacePhase_UNKNOWN
		return nil
	}

	log.WithFields(wsk8s.GetOWIFromObject(&pod.ObjectMeta)).WithField("pod", pod).Debug("cannot determine workspace phase")
	result.Phase = api.WorkspacePhase_UNKNOWN
	result.Message = "cannot determine workspace phase. We should never get here."
	return nil
}

// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
func extractFailure(wso workspaceObjects) (string, *api.WorkspacePhase) {
	pod := wso.Pod

	// if the workspace was explicitly marked as failed, that also constitutes a failure reason
	reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation]
	if explicitFailure {
		return reason, nil
	}

	status := pod.Status
	if status.Phase == corev1.PodFailed && (status.Reason != "" || status.Message != "") {
		// Don't force the phase to UNKNOWN here to leave a chance that we may detect the actual phase of
		// the workspace, e.g. stopping.
		return fmt.Sprintf("%s: %s", status.Reason, status.Message), nil
	}

	for _, cs := range status.ContainerStatuses {
		if cs.State.Waiting != nil {
			if cs.State.Waiting.Reason == "ImagePullBackOff" || cs.State.Waiting.Reason == "ErrImagePull" {
				// If the image pull failed we were definitely in the api.WorkspacePhase_CREATING phase,
				// unless of course this pod has been deleted already.
				var res api.WorkspacePhase
				if isPodBeingDeleted(pod) {
					res = api.WorkspacePhase_STOPPING
				} else {
					res = api.WorkspacePhase_CREATING
				}
				return fmt.Sprintf("cannot pull image: %s", cs.State.Waiting.Message), &res
			}
		}

		terminationState := cs.State.Terminated
		if terminationState == nil {
			terminationState = cs.LastTerminationState.Terminated
		}
		if terminationState != nil {
			// a terminated workspace container is not necessarily bad. During shutdown workspace containers
			// can go into this state and that's ok. However, if the workspace was shutting down due to deletion,
			// we would not be here as we've checked for a DeletionTimestamp prior. So let's find out why the
			// container is terminating.
			if terminationState.Message != "" {
				// the container itself told us why it was terminated - use that as the failure reason
				return extractFailureFromLogs([]byte(terminationState.Message)), nil
			} else if terminationState.Reason == "Error" {
				if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerKilledExitCode {
					return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), nil
				}
			} else if terminationState.Reason == "Completed" {
				return fmt.Sprintf("container %s completed; containers of a workspace pod are not supposed to do that", cs.Name), nil
			} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
				// if a container is terminated and it wasn't because of either:
				//   - regular shutdown
				//   - the exit code "UNKNOWN" (which might be caused by an intermittent issue and is handled in extractStatusFromPod)
				//   - another known error
				// then we report it as UNKNOWN
				res := api.WorkspacePhase_UNKNOWN
				return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), &res
			}
		}
	}

	// some failure conditions are only evident from the "events" that Kubernetes stores, e.g. events coming from the kubelet
	for _, evt := range wso.Events {
		if evt.Reason != "FailedMount" {
			continue
		}

		// ideally we do not just use evt.Message as the failure reason because it contains internal paths and is not useful to the user
		if strings.Contains(evt.Message, theiaVolumeName) {
			return "cannot mount Theia", nil
		} else if strings.Contains(evt.Message, workspaceVolumeName) {
			return "cannot mount workspace", nil
		} else {
			// if this happens we did not do a good job, because that means we've introduced another volume to the pod
			// but did not consider that mounting it might fail.
			return evt.Message, nil
		}
	}

	return "", nil
}

// extractFailureFromLogs attempts to extract the last error message from a workspace
// container's log output.
func extractFailureFromLogs(logs []byte) string {
	var sep = []byte("\n")
	var msg struct {
		Error   string `json:"error"`
		Message string `json:"message"`
	}

	var nidx int
	for idx := bytes.LastIndex(logs, sep); idx > 0; idx = nidx {
		nidx = bytes.LastIndex(logs[:idx], sep)
		if nidx < 0 {
			nidx = 0
		}

		line := logs[nidx:idx]
		err := json.Unmarshal(line, &msg)
		if err != nil {
			continue
		}

		if msg.Message == "" {
			continue
		}

		if msg.Error == "" {
			return msg.Message
		}

		return msg.Message + ": " + msg.Error
	}

	return string(logs)
}

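// For illustration, assuming newline-terminated JSON log lines such as
//
//	{"level":"info","message":"initializing workspace"}
//	{"error":"content initializer failed","message":"cannot initialize workspace"}
//
// extractFailureFromLogs walks the lines back to front and uses the last entry that carries
// a message, here yielding "cannot initialize workspace: content initializer failed".
// If no line parses as JSON with a message, the raw log output is returned as-is.
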
// isPodBeingDeleted returns true if the pod is currently being stopped/deleted
func isPodBeingDeleted(pod *corev1.Pod) bool {
	// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
	return pod.ObjectMeta.DeletionTimestamp != nil
}

type activity string

const (
	activityInit               activity = "initialization"
	activityStartup            activity = "startup"
	activityCreatingContainers activity = "creating containers"
	activityPullingImages      activity = "pulling images"
	activityRunningHeadless    activity = "running the headless workspace"
	activityNone               activity = "period of inactivity"
	activityClosed             activity = "after being closed"
	activityInterrupted        activity = "workspace interruption"
	activityStopping           activity = "stopping"
	activityBackup             activity = "backup"
)

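// These activity names surface verbatim in the timeout messages produced by the decide
// helper in isWorkspaceTimedOut below, e.g. (with a hypothetical duration)
// "workspace timed out after pulling images took longer than 00h12m".
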
// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and the state the pod is in.
// This function does NOT use the workspaceTimedoutAnnotation, but rather is used to set that annotation in the first place.
func (m *Manager) isWorkspaceTimedOut(wso workspaceObjects) (reason string, err error) {
	workspaceID, ok := wso.WorkspaceID()
	if !ok {
		return "", xerrors.Errorf("workspace has no %s annotation", workspaceIDAnnotation)
	}

	status, err := m.getWorkspaceStatus(wso)
	if err != nil {
		return "", xerrors.Errorf("cannot determine workspace phase: %w", err)
	}
	phase := status.Phase

	decide := func(start time.Time, timeout util.Duration, activity activity) (string, error) {
		td := time.Duration(timeout)
		inactivity := time.Since(start)
		if inactivity < td {
			return "", nil
		}

		return fmt.Sprintf("workspace timed out after %s took longer than %s", activity, formatDuration(inactivity)), nil
	}

	if wso.Pod != nil {
		start := wso.Pod.ObjectMeta.CreationTimestamp.Time
		lastActivity := m.getWorkspaceActivity(workspaceID)
		_, isClosed := wso.Pod.Annotations[workspaceClosedAnnotation]

		switch phase {
		case api.WorkspacePhase_PENDING:
			return decide(start, m.Config.Timeouts.Initialization, activityInit)

		case api.WorkspacePhase_INITIALIZING:
			return decide(start, m.Config.Timeouts.TotalStartup, activityStartup)

		case api.WorkspacePhase_CREATING:
			activity := activityCreatingContainers
			if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
				activity = activityPullingImages
			}
			return decide(start, m.Config.Timeouts.TotalStartup, activity)

		case api.WorkspacePhase_RUNNING:
			if wso.IsWorkspaceHeadless() {
				return decide(start, m.Config.Timeouts.HeadlessWorkspace, activityRunningHeadless)
			} else if lastActivity == nil {
				// the workspace is up and running, but the user has never produced any activity
				return decide(start, m.Config.Timeouts.TotalStartup, activityNone)
			} else if isClosed {
				return decide(*lastActivity, m.Config.Timeouts.AfterClose, activityClosed)
			}
			timeout := m.Config.Timeouts.RegularWorkspace
			if ctv, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
				if ct, err := time.ParseDuration(ctv); err != nil {
					log.WithError(err).WithField("customTimeout", ctv).WithFields(wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)).Warn("pod had a custom timeout annotation set, but we could not parse its value. Defaulting to the ws-manager config.")
					timeout = m.Config.Timeouts.RegularWorkspace
				} else {
					timeout = util.Duration(ct)
				}
			}
			return decide(*lastActivity, timeout, activityNone)

		case api.WorkspacePhase_INTERRUPTED:
			if lastActivity == nil {
				// the workspace is up and running, but the user has never produced any activity
				return decide(start, m.Config.Timeouts.Interrupted, activityInterrupted)
			}
			return decide(*lastActivity, m.Config.Timeouts.Interrupted, activityInterrupted)

		default:
			// the only other phases we can be in are stopping and stopped: we leave the stopping timeout to the PLIS branch and don't want to time out when stopped
			return "", nil
		}
	} else if wso.PLIS != nil {
		plis, err := unmarshalPodLifecycleIndependentState(wso.PLIS)
		if err != nil {
			return "", xerrors.Errorf("cannot determine workspace timeout: %w", err)
		}
		if plis == nil {
			return "", xerrors.Errorf("cannot determine workspace timeout: we have neither pod nor pod lifecycle independent state")
		}

		switch phase {
		case api.WorkspacePhase_STOPPING:
			if plis.StoppingSince == nil {
				return "", xerrors.Errorf("cannot determine workspace timeout: we don't know when we started stopping")
			}
			activity := activityStopping
			if status.Conditions.FinalBackupComplete != api.WorkspaceConditionBool_TRUE {
				activity = activityBackup
			}
			return decide(*plis.StoppingSince, m.Config.Timeouts.Stopping, activity)

		case api.WorkspacePhase_STOPPED:
			return "", nil

		default:
			// if we end up here then somehow we've reached a state where we're neither stopping nor stopped, but also don't
			// have a pod. Dunno how this could ever happen.
			log.WithField("wso", wso).Error("cannot determine workspace timeout: we should never get here (TM)")
		}
	}

	return "", xerrors.Errorf("cannot determine workspace timeout: we have neither pod nor pod lifecycle independent state")
}

// hasNetworkNotReadyEvent determines if a workspace experienced a network outage - now, or any time in the past - based on
// its Kubernetes events
func hasNetworkNotReadyEvent(wso workspaceObjects) bool {
	for _, evt := range wso.Events {
		if evt.Reason == "NetworkNotReady" {
			return true
		}
	}
	return false
}

func formatDuration(d time.Duration) string {
	d = d.Round(time.Minute)
	h := d / time.Hour
	d -= h * time.Hour
	m := d / time.Minute
	return fmt.Sprintf("%02dh%02dm", h, m)
}

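// For example, formatDuration(90*time.Minute) yields "01h30m"; durations are rounded
// to the nearest minute first, so formatDuration(29*time.Second) yields "00h00m".
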
// errNoPLIS is returned by getWorkspaceStatusFromPLIS if the PLIS configMap is present, but
// does not contain a PLIS annotation.
var errNoPLIS = xerrors.Errorf("workspace has no pod lifecycle independent state")

// getWorkspaceStatusFromPLIS tries to compute the workspace status from the pod lifecycle independent state alone.
// For this to work the PLIS must be set and contain the last pod-based status.
func (m *Manager) getWorkspaceStatusFromPLIS(wso workspaceObjects) (*api.WorkspaceStatus, error) {
	if wso.PLIS == nil {
		return nil, xerrors.Errorf("workspace has no pod lifecycle independent state obj")
	}

	plis, err := unmarshalPodLifecycleIndependentState(wso.PLIS)
	if err != nil {
		return nil, xerrors.Errorf("cannot get status from pod lifecycle independent state: %w", err)
	}
	if plis == nil {
		return nil, errNoPLIS
	}

	if plis.LastPodStatus == nil {
		return nil, xerrors.Errorf("pod lifecycle independent state does not contain last pod-based status")
	}

	status := plis.LastPodStatus
	err = extractStatusFromPLIS(status, &wso, plis)
	if err != nil {
		return nil, xerrors.Errorf("cannot get status from pod lifecycle independent state: %w", err)
	}

	return status, nil
}

// extractStatusFromPLIS takes the information in the pod lifecycle independent state and adds it to the status
func extractStatusFromPLIS(result *api.WorkspaceStatus, wso *workspaceObjects, plis *podLifecycleIndependentState) error {
	if plis == nil || wso.PLIS == nil {
		// no PLIS => nothing to extract
		return nil
	}

	if plis.FinalBackupComplete {
		result.Conditions.FinalBackupComplete = api.WorkspaceConditionBool_TRUE

		if wso.Pod == nil {
			// at this point the pod is gone and the final backup is complete, which means the workspace is finally stopped
			result.Phase = api.WorkspacePhase_STOPPED
		}
	}

	// if the final backup has failed we need to tell the world (if we haven't done so already)
	if plis.FinalBackupFailure != "" && !strings.Contains(result.Conditions.Failed, "last backup failed") {
		if result.Conditions.Failed != "" {
			result.Conditions.Failed += "; "
		}
		result.Conditions.Failed += fmt.Sprintf("last backup failed: %s. Please contact support if you need the workspace data.", plis.FinalBackupFailure)
	}

	// if the PLIS has a timeout annotation we must forward that to the conditions
	if timeout, ok := wso.PLIS.Annotations[workspaceTimedOutAnnotation]; result.Conditions.Timeout == "" && ok {
		result.Conditions.Timeout = timeout
	}

	return nil
}