@@ -15,6 +15,7 @@ import (
1515 "github.com/google/go-cmp/cmp"
1616 "github.com/google/go-cmp/cmp/cmpopts"
1717 "github.com/google/uuid"
18+ prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1819
1920 "k8s.io/apimachinery/pkg/api/equality"
2021 "k8s.io/apimachinery/pkg/api/meta"
@@ -145,6 +146,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
145146 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
146147 optrAvailableUpdates .AcceptRisks = acceptRisks
147148 optrAvailableUpdates .ConditionRegistry = optr .conditionRegistry
149+ optrAvailableUpdates .AlertGetter = optr .AlertGetter
148150 optrAvailableUpdates .Condition = condition
149151
150152 responseFailed := (condition .Type == configv1 .RetrievedUpdates &&
@@ -159,6 +161,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
159161 }
160162 }
161163
164+ if optrAvailableUpdates .ShouldReconcileAcceptRisks () {
165+ if err := optrAvailableUpdates .evaluateAlertRisks (ctx ); err != nil {
166+ klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
167+ }
168+ }
162169 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
163170
164171 queueKey := optr .queueKey ()
@@ -212,6 +219,11 @@ type availableUpdates struct {
212219
213220 // RiskConditions stores the condition for every risk (name, url, message, matchingRules).
214221 RiskConditions map [string ][]metav1.Condition
222+
223+ AlertGetter AlertGetter
224+
225+ // AlertRisks stores the condition for every alert that might have impact on cluster update
226+ AlertRisks []configv1.ConditionalUpdateRisk
215227}
216228
217229func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -317,6 +329,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
317329 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
318330 AcceptRisks : optr .availableUpdates .AcceptRisks ,
319331 RiskConditions : optr .availableUpdates .RiskConditions ,
332+ AlertRisks : optr .availableUpdates .AlertRisks ,
333+ AlertGetter : optr .availableUpdates .AlertGetter ,
320334 LastAttempt : optr .availableUpdates .LastAttempt ,
321335 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
322336 Current : * optr .availableUpdates .Current .DeepCopy (),
@@ -513,6 +527,166 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
513527 }
514528}
515529
530+ type AlertGetter interface {
531+ Get (ctx context.Context ) (prometheusv1.AlertsResult , error )
532+ }
533+
534+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) error {
535+ if u == nil || u .AlertGetter == nil {
536+ u .AlertRisks = nil
537+ return nil
538+ }
539+ alertsResult , err := u .AlertGetter .Get (ctx )
540+ if err != nil {
541+ return fmt .Errorf ("failed to get alerts: %w" , err )
542+ }
543+
544+ u .AlertRisks = alertsToRisks (alertsResult .Alerts )
545+ return nil
546+ }
547+
548+ func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
549+ klog .V (2 ).Infof ("Found %d alerts" , len (alerts ))
550+ risks := map [string ]configv1.ConditionalUpdateRisk {}
551+ for _ , alert := range alerts {
552+ var alertName string
553+ if alertName = string (alert .Labels ["alertname" ]); alertName == "" {
554+ continue
555+ }
556+ if alert .State == "pending" {
557+ continue
558+ }
559+
560+ var summary string
561+ if summary = string (alert .Annotations ["summary" ]); summary == "" {
562+ summary = alertName
563+ }
564+ if ! strings .HasSuffix (summary , "." ) {
565+ summary += "."
566+ }
567+
568+ var description string
569+ alertMessage := string (alert .Annotations ["message" ])
570+ alertDescription := string (alert .Annotations ["description" ])
571+ switch {
572+ case alertMessage != "" && alertDescription != "" :
573+ description += " The alert description is: " + alertDescription + " | " + alertMessage
574+ case alertDescription != "" :
575+ description += " The alert description is: " + alertDescription
576+ case alertMessage != "" :
577+ description += " The alert description is: " + alertMessage
578+ default :
579+ description += " The alert has no description."
580+ }
581+
582+ var runbook string
583+ if runbook = string (alert .Annotations ["runbook" ]); runbook == "" {
584+ runbook = "<alert does not have a runbook_url annotation>"
585+ }
586+
587+ details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
588+
589+ // TODO (hongkliu):
590+ alertURL := "https://console.com/monitoring/alertrules/id"
591+ alertPromQL := "todo-expression"
592+
593+ severity := string (alert .Labels ["severity" ])
594+ if severity == "critical" {
595+ if alertName == "PodDisruptionBudgetLimit" {
596+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
597+ }
598+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
599+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
600+ Status : metav1 .ConditionTrue ,
601+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
602+ Message : fmt .Sprintf ("%s alert %s %s, suggesting significant cluster issues worth investigating. %s" , severity , alertName , alert .State , details ),
603+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
604+ })
605+ continue
606+ }
607+
608+ if alertName == "PodDisruptionBudgetAtLimit" {
609+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
610+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
611+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
612+ Status : metav1 .ConditionTrue ,
613+ Reason : internal .AlertConditionReason (string (alert .State )),
614+ Message : fmt .Sprintf ("%s alert %s %s, which might slow node drains. %s" , severity , alertName , alert .State , details ),
615+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
616+ })
617+ continue
618+ }
619+
620+ if alertName == "KubeContainerWaiting" {
621+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
622+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
623+ Status : metav1 .ConditionTrue ,
624+ Reason : internal .AlertConditionReason (string (alert .State )),
625+ Message : fmt .Sprintf ("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s" , severity , alertName , alert .State , details ),
626+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
627+ })
628+ continue
629+ }
630+
631+ if alertName == "KubeNodeNotReady" || alertName == "KubeNodeReadinessFlapping" || alertName == "KubeNodeUnreachable" {
632+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
633+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
634+ Status : metav1 .ConditionTrue ,
635+ Reason : internal .AlertConditionReason (string (alert .State )),
636+ Message : fmt .Sprintf ("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s" , severity , alertName , alert .State , details ),
637+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
638+ })
639+ continue
640+ }
641+ }
642+
643+ klog .V (2 ).Infof ("Got %d risks" , len (risks ))
644+ if len (risks ) == 0 {
645+ return nil
646+ }
647+
648+ var ret []configv1.ConditionalUpdateRisk
649+ var keys []string
650+ for k := range risks {
651+ keys = append (keys , k )
652+ }
653+ sort .Strings (keys )
654+ for _ , k := range keys {
655+ ret = append (ret , risks [k ])
656+ }
657+ return ret
658+ }
659+
660+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url , promQL string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
661+ risk , ok := risks [riskName ]
662+ if ! ok {
663+ return configv1.ConditionalUpdateRisk {
664+ Name : riskName ,
665+ Message : message ,
666+ URL : url ,
667+ MatchingRules : []configv1.ClusterCondition {
668+ {
669+ Type : "PromQL" ,
670+ PromQL : & configv1.PromQLClusterCondition {
671+ PromQL : promQL ,
672+ },
673+ },
674+ },
675+ Conditions : []metav1.Condition {condition },
676+ }
677+ }
678+
679+ if c := meta .FindStatusCondition (risk .Conditions , condition .Type ); c != nil {
680+ c .Message = fmt .Sprintf ("%s; %s" , c .Message , condition .Message )
681+ if c .LastTransitionTime .After (condition .LastTransitionTime .Time ) {
682+ c .LastTransitionTime = condition .LastTransitionTime
683+ }
684+ meta .SetStatusCondition (& risk .Conditions , * c )
685+ }
686+
687+ return risk
688+ }
689+
516690func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
517691 if u == nil {
518692 return
@@ -522,7 +696,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
522696 risks := risksInOrder (riskVersions )
523697 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
524698
525- if err := sanityCheck (u .ConditionalUpdates ); err != nil {
699+ if err := sanityCheck (u .ConditionalUpdates , u . AlertRisks ); err != nil {
526700 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
527701 }
528702 for i , conditionalUpdate := range u .ConditionalUpdates {
@@ -540,7 +714,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
540714 }
541715}
542716
543- func sanityCheck (updates []configv1.ConditionalUpdate ) error {
717+ func sanityCheck (updates []configv1.ConditionalUpdate , alertRisks []configv1. ConditionalUpdateRisk ) error {
544718 risks := map [string ]configv1.ConditionalUpdateRisk {}
545719 var errs []error
546720 for _ , update := range updates {
@@ -556,6 +730,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
556730 }
557731 }
558732 }
733+ for _ , alertRisk := range alertRisks {
734+ if _ , ok := risks [alertRisk .Name ]; ok {
735+ errs = append (errs , fmt .Errorf ("found alert risk and conditional update risk share the name: %s" , alertRisk .Name ))
736+ }
737+ }
559738 return utilerrors .NewAggregate (errs )
560739}
561740
@@ -600,6 +779,7 @@ const (
600779 recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
601780 recommendedReasonEvaluationFailed = "EvaluationFailed"
602781 recommendedReasonMultiple = "MultipleReasons"
782+ //recommendedReasonAlertFiring = "AlertFiring"
603783
604784 // recommendedReasonExposed is used instead of the original name if it does
605785 // not match the pattern for a valid k8s condition reason.
@@ -617,10 +797,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
617797func newRecommendedReason (now , want string ) string {
618798 switch {
619799 case now == recommendedReasonRisksNotExposed ||
620- now == recommendedReasonAllExposedRisksAccepted ||
800+ now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
801+ now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
621802 now == want :
622803 return want
623- case want == recommendedReasonRisksNotExposed :
804+ case want == recommendedReasonRisksNotExposed ||
805+ (now == recommendedReasonEvaluationFailed ) && want == recommendedReasonAllExposedRisksAccepted ||
806+ now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed ):
624807 return now
625808 default :
626809 return recommendedReasonMultiple
@@ -677,7 +860,6 @@ func evaluateConditionalUpdate(
677860 if len (errorMessages ) > 0 {
678861 recommended .Message = strings .Join (errorMessages , "\n \n " )
679862 }
680-
681863 return recommended
682864}
683865
0 commit comments