JenTing Hsiao 8bd4df75e7 ws-manager: skip failure from terminated message
For some reason, the pod is killed with unknown container status and no taints on the underlying node.
Therefore, we skip extracting the failure from the terminated message.

Signed-off-by: JenTing Hsiao <hsiaoairplane@gmail.com>
2023-01-13 06:06:52 +01:00

857 lines
30 KiB
Go

// Copyright (c) 2020 Gitpod GmbH. All rights reserved.
// Licensed under the GNU Affero General Public License (AGPL).
// See License.AGPL.txt in the project root for license information.
package manager
import (
"bytes"
"context"
"encoding/json"
"fmt"
"strings"
"time"
"github.com/sirupsen/logrus"
"golang.org/x/xerrors"
"google.golang.org/protobuf/types/known/timestamppb"
corev1 "k8s.io/api/core/v1"
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
"k8s.io/apimachinery/pkg/runtime"
"k8s.io/apimachinery/pkg/types"
"github.com/gitpod-io/gitpod/common-go/kubernetes"
wsk8s "github.com/gitpod-io/gitpod/common-go/kubernetes"
"github.com/gitpod-io/gitpod/common-go/log"
"github.com/gitpod-io/gitpod/common-go/util"
regapi "github.com/gitpod-io/gitpod/registry-facade/api"
"github.com/gitpod-io/gitpod/ws-manager/api"
)
const (
// containerKilledExitCode is the exit code Kubernetes uses for a container which was killed by the system.
// We expect such containers to be restarted by Kubernetes if they're supposed to be running.
// We never deliberately terminate a container like this.
containerKilledExitCode = 137
// containerUnknownExitCode is the exit code containerd uses if it cannot determine the cause/exit status of
// a stopped container.
containerUnknownExitCode = 255
)
// Scheme is the default instance of runtime.Scheme to which types in the Kubernetes API are already registered.
// Inspired by https://github.com/kubernetes/kubernetes/blob/master/pkg/kubectl/scheme/scheme.go
var scheme = runtime.NewScheme()
// register the core schema
func init() {
err := corev1.AddToScheme(scheme)
if err != nil {
log.WithError(err).Fatal("cannot register Kubernetes core schema - this should never happen")
}
}
// workspaceObjects contains all Kubernetes objects required to compute the status of a workspace
type workspaceObjects struct {
Pod *corev1.Pod `json:"pod"`
Events []corev1.Event `json:"events,omitempty"`
}
// GetOWI produces the owner, workspace, instance tripple that we use for tracing and logging
func (wso *workspaceObjects) GetOWI() logrus.Fields {
if wso.Pod != nil {
return wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)
}
return logrus.Fields{}
}
// IsWorkspaceHeadless returns true if the workspace described by these objects is headless
func (wso *workspaceObjects) IsWorkspaceHeadless() bool {
if wso.Pod != nil {
val, ok := wso.Pod.ObjectMeta.Labels[headlessLabel]
return ok && val == "true"
}
return false
}
func (wso *workspaceObjects) WorkspaceType() (api.WorkspaceType, error) {
var meta *metav1.ObjectMeta
if wso.Pod != nil {
meta = &wso.Pod.ObjectMeta
} else {
// we don't know anything about this pod - assume it's a regular pod
return api.WorkspaceType_REGULAR, xerrors.Errorf("cannot determine pod type")
}
lbl, ok := meta.Labels[wsk8s.TypeLabel]
if !ok {
tpe := api.WorkspaceType_REGULAR
log.WithFields(wsk8s.GetOWIFromObject(meta)).WithField("workspaceType", tpe).Info("determining type of legacy pod")
return tpe, nil
}
val, ok := api.WorkspaceType_value[strings.ToUpper(lbl)]
if !ok {
// pod has invalid type label - we assume it's a regular pod
return api.WorkspaceType_REGULAR, xerrors.Errorf("pod has invalid type label: %s", lbl)
}
return api.WorkspaceType(val), nil
}
// WorkspaceID returns the ID of the workspace
func (wso *workspaceObjects) WorkspaceID() (id string, ok bool) {
if wso.Pod != nil {
r, ok := wso.Pod.Annotations[workspaceIDAnnotation]
if ok {
return r, true
}
}
return "", false
}
func (wso *workspaceObjects) WasEverReady() (res bool) {
check := func(a map[string]string) bool {
_, neverReady := a[workspaceNeverReadyAnnotation]
return !neverReady
}
if wso.Pod != nil {
return check(wso.Pod.Annotations)
}
// We assume the pod was ready by default, even if we have nothing to show for it.
// The real world has shown that this produces the more favorable failure modes.
return true
}
func (wso *workspaceObjects) AlreadyInitializing() (res bool) {
check := func(a map[string]string) bool {
v, ok := a[alreadyInitializingAnnotation]
if !ok {
return false
}
return v == util.BooleanTrueString
}
if wso.Pod != nil {
return check(wso.Pod.Annotations)
}
return true
}
// HostName returns the name of the node this workspace is/was deployed to. If this workspace has never been deployed anywhere, HostName returns an empty string.
func (wso *workspaceObjects) NodeName() string {
if wso.Pod == nil {
return ""
}
if wso.Pod.Spec.NodeName != "" {
return wso.Pod.Spec.NodeName
}
if res, ok := wso.Pod.Annotations[nodeNameAnnotation]; ok {
return res
}
return ""
}
func (m *Manager) getWorkspaceObjects(ctx context.Context, pod *corev1.Pod) (*workspaceObjects, error) {
wso := &workspaceObjects{Pod: pod}
err := m.completeWorkspaceObjects(ctx, wso)
if err != nil {
return nil, xerrors.Errorf("getWorkspaceObjects: %w", err)
}
return wso, nil
}
// completeWorkspaceObjects finds the remaining Kubernetes objects based on the pod description
// or pod lifecycle indepedent state.
func (m *Manager) completeWorkspaceObjects(ctx context.Context, wso *workspaceObjects) error {
if wso.Pod == nil {
return xerrors.Errorf("completeWorkspaceObjects: need either pod or lifecycle independent state")
}
if wso.Events == nil {
events, err := m.RawClient.CoreV1().Events(m.Config.Namespace).Search(scheme, wso.Pod)
if err != nil {
return xerrors.Errorf("completeWorkspaceObjects: %w", err)
}
wso.Events = make([]corev1.Event, len(events.Items))
copy(wso.Events, events.Items)
}
return nil
}
func (m *Manager) getWorkspaceStatus(wso workspaceObjects) (*api.WorkspaceStatus, error) {
id, ok := wso.WorkspaceID()
if !ok {
return nil, xerrors.Errorf("cannot get %s annotation from %s", workspaceIDAnnotation, wso.Pod.Name)
}
var status *api.WorkspaceStatus
if wso.Pod == nil {
return nil, xerrors.Errorf("need pod to compute status")
}
// we have a workspace pod - use that to compute the status from scratch
workspaceContainer := getContainer(wso.Pod, "workspace")
if workspaceContainer == nil {
return nil, xerrors.Errorf("workspace pod for %s is degenerate - does not have workspace container", id)
}
wsurl, ok := wso.Pod.Annotations[kubernetes.WorkspaceURLAnnotation]
if !ok {
return nil, xerrors.Errorf("pod %s has no %s annotation", wso.Pod.Name, kubernetes.WorkspaceURLAnnotation)
}
tpe, err := wso.WorkspaceType()
if err != nil {
return nil, err
}
var timeout string
if t := m.Config.Timeouts.RegularWorkspace; t > 0 {
timeout = t.String()
}
if v, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
timeout = v
}
var (
wsImage = workspaceContainer.Image
ideImage string
supervisorImage string
ideImagesLayers []string
)
if ispec, ok := wso.Pod.Annotations[kubernetes.WorkspaceImageSpecAnnotation]; ok {
spec, err := regapi.ImageSpecFromBase64(ispec)
if err != nil {
return nil, xerrors.Errorf("invalid image spec: %w", err)
}
wsImage = spec.BaseRef
ideImage = spec.IdeRef
supervisorImage = spec.SupervisorRef
ideImagesLayers = spec.IdeLayerRef
}
ownerToken, ok := wso.Pod.Annotations[kubernetes.OwnerTokenAnnotation]
if !ok {
log.WithFields(wso.GetOWI()).Warn("pod has no owner token. is this a legacy pod?")
}
admission := api.AdmissionLevel_ADMIT_OWNER_ONLY
if av, ok := api.AdmissionLevel_value[strings.ToUpper(wso.Pod.Annotations[kubernetes.WorkspaceAdmissionAnnotation])]; ok {
admission = api.AdmissionLevel(av)
}
var volumeSnapshotStatus workspaceVolumeSnapshotStatus
if rawVolumeSnapshotStatus, ok := wso.Pod.Annotations[pvcWorkspaceVolumeSnapshotAnnotation]; ok {
err := json.Unmarshal([]byte(rawVolumeSnapshotStatus), &volumeSnapshotStatus)
if err != nil {
return nil, xerrors.Errorf("invalid volume snapshot status: %w", err)
}
}
status = &api.WorkspaceStatus{
Id: id,
StatusVersion: m.clock.Tick(),
Metadata: getWorkspaceMetadata(wso.Pod),
Spec: &api.WorkspaceSpec{
Headless: wso.IsWorkspaceHeadless(),
WorkspaceImage: wsImage,
DeprecatedIdeImage: ideImage,
IdeImage: &api.IDEImage{
WebRef: ideImage,
SupervisorRef: supervisorImage,
},
IdeImageLayers: ideImagesLayers,
Url: wsurl,
Type: tpe,
Timeout: timeout,
Class: wso.Pod.Labels[workspaceClassLabel],
},
Conditions: &api.WorkspaceConditions{
Snapshot: wso.Pod.Annotations[workspaceSnapshotAnnotation],
VolumeSnapshot: &api.VolumeSnapshotInfo{
VolumeSnapshotName: volumeSnapshotStatus.VolumeSnapshotName,
VolumeSnapshotHandle: volumeSnapshotStatus.VolumeSnapshotHandle,
},
},
Runtime: &api.WorkspaceRuntimeInfo{
NodeName: wso.Pod.Spec.NodeName,
PodName: wso.Pod.Name,
NodeIp: wso.Pod.Status.HostIP,
},
Auth: &api.WorkspaceAuthentication{
Admission: admission,
OwnerToken: ownerToken,
},
}
err = m.extractStatusFromPod(status, wso)
if err != nil {
return nil, xerrors.Errorf("cannot get workspace status: %w", err)
}
return status, nil
}
func getContainer(pod *corev1.Pod, name string) *corev1.Container {
for _, c := range pod.Spec.Containers {
if c.Name == name {
return &c
}
}
return nil
}
// getWorkspaceMetadata extracts a workspace's metadata from pod labels
func getWorkspaceMetadata(pod *corev1.Pod) *api.WorkspaceMetadata {
started := timestamppb.New(pod.CreationTimestamp.Time)
annotations := make(map[string]string)
for k, v := range pod.Annotations {
if !strings.HasPrefix(k, workspaceAnnotationPrefix) &&
!strings.HasPrefix(k, workspaceNeverReadyAnnotation) {
continue
}
annotations[strings.TrimPrefix(k, workspaceAnnotationPrefix)] = v
}
return &api.WorkspaceMetadata{
Owner: pod.ObjectMeta.Labels[wsk8s.OwnerLabel],
MetaId: pod.ObjectMeta.Labels[wsk8s.MetaIDLabel],
StartedAt: started,
Annotations: annotations,
}
}
func (m *Manager) extractStatusFromPod(result *api.WorkspaceStatus, wso workspaceObjects) error {
pod := wso.Pod
result.Spec.ExposedPorts = extractExposedPorts(pod).Ports
// check failure states, i.e. determine value of result.Failed
failure, phase := m.extractFailure(wso, m.metrics)
result.Conditions.Failed = failure
if phase != nil {
result.Phase = *phase
return nil
}
if reason, timedout := pod.Annotations[workspaceTimedOutAnnotation]; timedout {
if reason == "" {
reason = "workspace timed out for an unknown reason"
}
result.Conditions.Timeout = reason
}
if _, sbr := pod.Annotations[stoppedByRequestAnnotation]; sbr {
result.Conditions.StoppedByRequest = api.WorkspaceConditionBool_TRUE
}
if _, abr := pod.Annotations[abortRequestAnnotation]; abr {
result.Conditions.Aborted = api.WorkspaceConditionBool_TRUE
}
if wso.IsWorkspaceHeadless() {
for _, cs := range pod.Status.ContainerStatuses {
if cs.State.Terminated != nil && cs.State.Terminated.Message != "" {
result.Conditions.HeadlessTaskFailed = cs.State.Terminated.Message
break
}
}
}
// if ws-manager is still attempting to create workspace pod, keep status in pending
// pod might get deleted several times if we cannot schedule it on the node
if _, atc := pod.Annotations[attemptingToCreatePodAnnotation]; atc {
result.Phase = api.WorkspacePhase_PENDING
return nil
}
if isPodBeingDeleted(pod) {
result.Phase = api.WorkspacePhase_STOPPING
var hasFinalizer bool
for _, f := range wso.Pod.Finalizers {
if f == gitpodFinalizerName {
hasFinalizer = true
break
}
}
if !hasFinalizer {
// We do this independently of the dispostal status because pods only get their finalizer
// once they're running. If they fail before they reach the running phase we'll never see
// a disposal status, hence would never stop the workspace.
result.Phase = api.WorkspacePhase_STOPPED
}
if rawDisposalStatus, ok := pod.Annotations[disposalStatusAnnotation]; ok {
var ds workspaceDisposalStatus
err := json.Unmarshal([]byte(rawDisposalStatus), &ds)
if err != nil {
return err
}
if ds.Status.IsDisposed() {
if ds.BackupFailure == "" {
result.Conditions.FinalBackupComplete = api.WorkspaceConditionBool_TRUE
}
// Finalizer or not - once the backup is complete we consider the pod stopped.
// Once the finalizer is removed, there's no guarantee we see the pod again, because it might be
// deleted too quickly for us to handle. Hence, we consider the workspace stoppped once the backup
// is done, even though the finalizer might still be present.
//
// This runs the risk that a pod might still be present, but is considered stopped.
result.Phase = api.WorkspacePhase_STOPPED
}
result.Repo = ds.GitStatus
// if the final backup has failed we need to tell the world (if we haven't done so already)
if ds.BackupFailure != "" && !strings.Contains(result.Conditions.Failed, "last backup failed") {
if result.Conditions.Failed != "" {
result.Conditions.Failed += "; "
}
result.Conditions.Failed += fmt.Sprintf("last backup failed: %s.", ds.BackupFailure)
}
}
return nil
}
status := pod.Status
if status.Phase == corev1.PodPending {
// check if any container is still pulling images
for _, cs := range status.ContainerStatuses {
if cs.State.Waiting != nil {
if cs.State.Waiting.Reason != "ContainerCreating" && cs.State.Waiting.Reason != "PodInitializing" {
continue
}
if wso.WasEverReady() {
// Workspace was ready at some point but has become unready. This should never happen due to the
// very conservative readiness probe. If it does happen however, we don't want to fall back to a
// creating phase.
log.WithField("pod", pod).Warn("once ready pod became unready - this should not happen")
continue
}
result.Phase = api.WorkspacePhase_CREATING
result.Conditions.PullingImages = api.WorkspaceConditionBool_TRUE
result.Message = "containers are being created"
return nil
}
}
result.Phase = api.WorkspacePhase_PENDING
result.Message = "pod is pending"
return nil
} else if status.Phase == corev1.PodRunning {
if firstUserActivity, ok := wso.Pod.Annotations[firstUserActivityAnnotation]; ok {
t, err := time.Parse(time.RFC3339Nano, firstUserActivity)
if err != nil {
return xerrors.Errorf("cannot parse firstUserActivity: %w", err)
}
pt := timestamppb.New(t)
result.Conditions.FirstUserActivity = pt
}
for _, cs := range status.ContainerStatuses {
// containers that were terminated are not ready, but may have been
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerUnknownExitCode {
// the container was stopped for an unknown reason.
// this means that the workspace is currently interrupted.
result.Phase = api.WorkspacePhase_INTERRUPTED
// was it caused by a network outage?
if hasNetworkNotReadyEvent(wso) {
// this might be a false positive in case the workspace recovers but has an exit code 255 for
// another reason afterwards. As of now we're lagging the data to handle it better.
result.Conditions.NetworkNotReady = api.WorkspaceConditionBool_TRUE
result.Message = "container network not ready - workspace should recover shortly"
} else {
result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
}
return nil
}
if cs.State.Terminated != nil && cs.State.Terminated.ExitCode == containerKilledExitCode {
// we have a container which was killed with exit code 137 recently and is (hopefully) restarting at the moment.
// this means that the workspace is currently interrupted.
result.Phase = api.WorkspacePhase_INTERRUPTED
result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace should recover", cs.Name)
return nil
}
if !cs.Ready && cs.RestartCount > 0 {
// this container was running before, but is currently recovering from an interruption
result.Phase = api.WorkspacePhase_INTERRUPTED
result.Message = fmt.Sprintf("container %s was terminated unexpectedly - workspace is recovering", cs.Name)
return nil
}
}
tpe, err := wso.WorkspaceType()
if err != nil {
log.WithError(err).Warn("cannot determine workspace type - assuming this is a regular")
tpe = api.WorkspaceType_REGULAR
}
if wso.IsWorkspaceHeadless() && tpe != api.WorkspaceType_PREBUILD {
// headless workspaces (except prebuilds) don't expose a public service and thus cannot be asked about their status.
// once kubernetes reports the workspace running, so do we.
result.Phase = api.WorkspacePhase_RUNNING
return nil
}
// the pod phase of pending is sometimes slipped because it may not be PENDING at the stage of retrieving information on the workspace pod.
if !wso.AlreadyInitializing() {
result.Phase = api.WorkspacePhase_CREATING
result.Message = "pod is running but init has not started yet or is not finished"
return nil
}
if _, neverReady := pod.Annotations[workspaceNeverReadyAnnotation]; !neverReady {
// workspcae has been marked ready by a workspace-ready probe of the monitor
result.Phase = api.WorkspacePhase_RUNNING
return nil
}
// workspace has not yet been marked ready by one of monitor's probes. It must be initializing then.
result.Phase = api.WorkspacePhase_INITIALIZING
result.Message = "workspace initializer is running"
return nil
} else if isCompletedHeadless(&wso) {
result.Phase = api.WorkspacePhase_STOPPING
result.Message = "headless workspace is stopping"
return nil
} else if status.Phase == corev1.PodSucceeded {
result.Phase = api.WorkspacePhase_STOPPING
result.Message = "workspace is stopping"
return nil
} else if status.Phase == corev1.PodUnknown {
result.Phase = api.WorkspacePhase_UNKNOWN
result.Message = "Kubernetes reports workspace phase as unknown"
return nil
}
// If we've extracted a failure reason earlier, but no explicit phase at the time, as well as in this
// function, we resort to unknown.
// This is different to being unable to determine the workspace phase - the phase is unknown due to an unknown failure (by definition).
if failure != "" {
result.Phase = api.WorkspacePhase_UNKNOWN
return nil
}
log.WithFields(wsk8s.GetOWIFromObject(&pod.ObjectMeta)).WithField("pod", pod).Error("cannot determine workspace phase")
result.Phase = api.WorkspacePhase_UNKNOWN
result.Message = "cannot determine workspace phase. We should never get here."
return nil
}
// extractFailure returns a pod failure reason and possibly a phase. If phase is nil then
// one should extract the phase themselves. If the pod has not failed, this function returns "", nil.
func (m *Manager) extractFailure(wso workspaceObjects, metrics *metrics) (string, *api.WorkspacePhase) {
log := log.WithFields(wso.GetOWI())
pod := wso.Pod
wsType := strings.ToUpper(pod.Labels[wsk8s.TypeLabel])
wsClass := pod.Labels[workspaceClassLabel]
// if the workspace was explicitely marked as failed that also constitutes a failure reason
reason, explicitFailure := pod.Annotations[workspaceExplicitFailAnnotation]
if explicitFailure {
return reason, nil
}
status := pod.Status
if status.Phase == corev1.PodFailed && (status.Reason != "" || status.Message != "") {
// Don't force the phase to UNKNONWN here to leave a chance that we may detect the actual phase of
// the workspace, e.g. stopping.
return fmt.Sprintf("%s: %s", status.Reason, status.Message), nil
}
for _, cs := range status.ContainerStatuses {
if cs.State.Waiting != nil {
if cs.State.Waiting.Reason == "ImagePullBackOff" || cs.State.Waiting.Reason == "ErrImagePull" {
// If the image pull failed we were definitely in the api.WorkspacePhase_CREATING phase,
// unless of course this pod has been deleted already.
var res *api.WorkspacePhase
if isPodBeingDeleted(pod) {
// The pod is being deleted already and we have to decide the phase based on the presence of the
// finalizer and disposal status annotation. That code already exists in the remainder of getStatus,
// hence we defer the decision.
res = nil
} else {
c := api.WorkspacePhase_CREATING
res = &c
}
return fmt.Sprintf("cannot pull image: %s", cs.State.Waiting.Message), res
}
}
terminationState := cs.State.Terminated
if terminationState == nil {
terminationState = cs.LastTerminationState.Terminated
}
if terminationState != nil {
// a workspace terminated container is not neccesarily bad. During shutdown workspaces containers
// can go in this state and that's ok. However, if the workspace was shutting down due to deletion,
// we would not be here as we've checked for a DeletionTimestamp prior. So let's find out why the
// container is terminating.
if terminationState.ExitCode != 0 && terminationState.Message != "" {
var phase *api.WorkspacePhase
if !isPodBeingDeleted(pod) {
// If the wrote a termination message and is not currently being deleted,
// then it must have been/be running. If we did not force the phase here,
// we'd be in unknown.
c := api.WorkspacePhase_RUNNING
phase = &c
}
if terminationState.ExitCode == containerKilledExitCode && terminationState.Reason == "ContainerStatusUnknown" {
// For some reason, the pod is killed with unknown container status and no taints on the underlying node.
// Therefore, we skip extracting the failure from the terminated message.
// ref: https://github.com/gitpod-io/gitpod/issues/12021
if m.Clientset != nil {
var node corev1.Node
if err := m.Clientset.Get(context.TODO(), types.NamespacedName{Namespace: "", Name: wso.NodeName()}, &node); err == nil && len(node.Spec.Taints) == 0 {
return "", nil
}
}
}
// the container itself told us why it was terminated - use that as failure reason
return extractFailureFromLogs([]byte(terminationState.Message)), phase
} else if terminationState.Reason == "Error" {
if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerKilledExitCode {
phase := api.WorkspacePhase_RUNNING
return fmt.Sprintf("container %s ran with an error: exit code %d", cs.Name, terminationState.ExitCode), &phase
}
} else if terminationState.Reason == "Completed" {
// container terminated successfully - this is not a failure
if !isPodBeingDeleted(pod) {
if metrics != nil && !wso.IsWorkspaceHeadless() {
metrics.totalUnintentionalWorkspaceStopCounterVec.WithLabelValues(wsType, wsClass).Inc()
}
}
return "", nil
} else if !isPodBeingDeleted(pod) && terminationState.ExitCode != containerUnknownExitCode {
// if a container is terminated and it wasn't because of either:
// - regular shutdown
// - the exit code "UNKNOWN" (which might be caused by an intermittent issue and is handled in extractStatusFromPod)
// - another known error
// then we report it as UNKNOWN
res := api.WorkspacePhase_UNKNOWN
return fmt.Sprintf("workspace container %s terminated for an unknown reason: (%s) %s", cs.Name, terminationState.Reason, terminationState.Message), &res
}
}
}
// some failure conditions are only evident from the "events" that kubernetes stores, e.g. events coming from the kubelet
for _, evt := range wso.Events {
if evt.Reason != "FailedMount" {
continue
}
if strings.Contains(evt.Message, "MountVolume.MountDevice failed for volume") {
// ref: https://github.com/gitpod-io/gitpod/issues/13353
// ref: https://github.com/kubernetes-sigs/gcp-compute-persistent-disk-csi-driver/issues/608
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
if metrics != nil {
metrics.totalMountDeviceFailedVec.WithLabelValues(wsType, wsClass).Inc()
}
return "", nil
} else if strings.Contains(evt.Message, workspaceVolumeName) {
// ref: https://github.com/gitpod-io/gitpod/issues/14032
log.WithField("pod", pod.Name).Warnf("%s", evt.Message)
if metrics != nil {
metrics.totalCannotMountVolumeVec.WithLabelValues(wsType, wsClass).Inc()
}
return "", nil
} else {
// if this happens we did not do a good job because that means we've introduced another volume to the pod
// but did not consider that mounting it might fail.
return evt.Message, nil
}
}
return "", nil
}
// extractFailureFromLogs attempts to extract the last error message from a workspace
// container's log output.
func extractFailureFromLogs(logs []byte) string {
var sep = []byte("\n")
var msg struct {
Error string `json:"error"`
Message string `json:"message"`
}
var nidx int
for idx := bytes.LastIndex(logs, sep); idx > 0; idx = nidx {
nidx = bytes.LastIndex(logs[:idx], sep)
if nidx < 0 {
nidx = 0
}
line := logs[nidx:idx]
err := json.Unmarshal(line, &msg)
if err != nil {
continue
}
if msg.Message == "" {
continue
}
if msg.Error == "" {
return msg.Message
}
return msg.Message + ": " + msg.Error
}
return string(logs)
}
// isPodBeingDeleted returns true if the pod is currently being deleted
func isPodBeingDeleted(pod *corev1.Pod) bool {
// if the pod is being deleted the only marker we have is that the deletionTimestamp is set
return pod.ObjectMeta.DeletionTimestamp != nil
}
// isCompletedHeadless returns true if the pod is a headless workspace and either succeeded or failed (e.g., ran to completion)
func isCompletedHeadless(wso *workspaceObjects) bool {
return wso.IsWorkspaceHeadless() && (wso.Pod.Status.Phase == corev1.PodSucceeded || wso.Pod.Status.Phase == corev1.PodFailed)
}
type activity string
const (
activityInit activity = "initialization"
activityStartup activity = "startup"
activityCreatingContainers activity = "creating containers"
activityPullingImages activity = "pulling images"
activityRunningHeadless activity = "running the headless workspace"
activityNone activity = "period of inactivity"
activityMaxLifetime activity = "maximum lifetime"
activityClosed activity = "after being closed"
activityInterrupted activity = "workspace interruption"
activityStopping activity = "stopping"
activityBackup activity = "backup"
)
// isWorkspaceTimedOut determines if a workspace is timed out based on the manager configuration and state the pod is in.
// This function does NOT use the workspaceTimedoutAnnotation, but rather is used to set that annotation in the first place.
func (m *Manager) isWorkspaceTimedOut(wso workspaceObjects) (reason string, err error) {
workspaceID, ok := wso.WorkspaceID()
if !ok {
return "", xerrors.Errorf("workspace has no %s annotation", workspaceIDAnnotation)
}
status, err := m.getWorkspaceStatus(wso)
if err != nil {
return "", xerrors.Errorf("cannot determine workspace phase: %w", err)
}
phase := status.Phase
decide := func(start time.Time, timeout util.Duration, activity activity) (string, error) {
td := time.Duration(timeout)
inactivity := time.Since(start)
if inactivity < td {
return "", nil
}
return fmt.Sprintf("workspace timed out after %s (%s) took longer than %s", activity, formatDuration(inactivity), formatDuration(td)), nil
}
start := wso.Pod.ObjectMeta.CreationTimestamp.Time
lastActivity := m.getWorkspaceActivity(workspaceID)
_, isClosed := wso.Pod.Annotations[workspaceClosedAnnotation]
switch phase {
case api.WorkspacePhase_PENDING:
return decide(start, m.Config.Timeouts.Initialization, activityInit)
case api.WorkspacePhase_INITIALIZING:
return decide(start, m.Config.Timeouts.TotalStartup, activityStartup)
case api.WorkspacePhase_CREATING:
activity := activityCreatingContainers
if status.Conditions.PullingImages == api.WorkspaceConditionBool_TRUE {
activity = activityPullingImages
}
return decide(start, m.Config.Timeouts.TotalStartup, activity)
case api.WorkspacePhase_RUNNING:
// First check is always for the max lifetime
if msg, err := decide(start, m.Config.Timeouts.MaxLifetime, activityMaxLifetime); msg != "" {
return msg, err
}
timeout := m.Config.Timeouts.RegularWorkspace
activity := activityNone
if wso.IsWorkspaceHeadless() {
timeout = m.Config.Timeouts.HeadlessWorkspace
lastActivity = &start
activity = activityRunningHeadless
} else if lastActivity == nil {
// the workspace is up and running, but the user has never produced any activity
return decide(start, m.Config.Timeouts.TotalStartup, activityNone)
} else if isClosed {
return decide(*lastActivity, m.Config.Timeouts.AfterClose, activityClosed)
}
if ctv, ok := wso.Pod.Annotations[customTimeoutAnnotation]; ok {
if ct, err := time.ParseDuration(ctv); err == nil {
timeout = util.Duration(ct)
} else {
log.WithError(err).WithField("customTimeout", ctv).WithFields(wsk8s.GetOWIFromObject(&wso.Pod.ObjectMeta)).Warn("pod had custom timeout annotation set, but could not parse its value. Defaulting to ws-manager config.")
}
}
return decide(*lastActivity, timeout, activity)
case api.WorkspacePhase_INTERRUPTED:
if lastActivity == nil {
// the workspace is up and running, but the user has never produced any activity
return decide(start, m.Config.Timeouts.Interrupted, activityInterrupted)
}
return decide(*lastActivity, m.Config.Timeouts.Interrupted, activityInterrupted)
case api.WorkspacePhase_STOPPING:
if isPodBeingDeleted(wso.Pod) && status.Conditions.FinalBackupComplete != api.WorkspaceConditionBool_TRUE {
// Beware: we apply the ContentFinalization timeout only to workspaces which are currently being deleted.
// We basically don't expect a workspace to be in content finalization before it's been deleted.
return decide(wso.Pod.DeletionTimestamp.Time, m.Config.Timeouts.ContentFinalization, activityBackup)
} else if !isPodBeingDeleted(wso.Pod) {
// pods that have not been deleted have never timed out
return "", nil
} else {
return decide(wso.Pod.DeletionTimestamp.Time, m.Config.Timeouts.Stopping, activityStopping)
}
default:
// the only other phases we can be in is stopped which is pointless to time out
return "", nil
}
}
// hasNetworkNotReadyEvent determines if a workspace experienced a network outage - now, or any time in the past - based on
// its kubernetes events
func hasNetworkNotReadyEvent(wso workspaceObjects) bool {
for _, evt := range wso.Events {
if evt.Reason == "NetworkNotReady" {
return true
}
}
return false
}
func formatDuration(d time.Duration) string {
d = d.Round(time.Minute)
h := d / time.Hour
d -= h * time.Hour
m := d / time.Minute
return fmt.Sprintf("%02dh%02dm", h, m)
}