@@ -15,6 +15,7 @@ import (
1515 "github.com/google/go-cmp/cmp"
1616 "github.com/google/go-cmp/cmp/cmpopts"
1717 "github.com/google/uuid"
18+ prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1819
1920 "k8s.io/apimachinery/pkg/api/equality"
2021 "k8s.io/apimachinery/pkg/api/meta"
@@ -145,6 +146,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
145146 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
146147 optrAvailableUpdates .AcceptRisks = acceptRisks
147148 optrAvailableUpdates .ConditionRegistry = optr .conditionRegistry
149+ optrAvailableUpdates .AlertGetter = optr .AlertGetter
148150 optrAvailableUpdates .Condition = condition
149151
150152 responseFailed := (condition .Type == configv1 .RetrievedUpdates &&
@@ -159,6 +161,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
159161 }
160162 }
161163
164+ if optrAvailableUpdates .ShouldReconcileAcceptRisks () {
165+ if err := optrAvailableUpdates .evaluateAlertRisks (ctx ); err != nil {
166+ klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
167+ }
168+ }
162169 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
163170
164171 queueKey := optr .queueKey ()
@@ -212,6 +219,11 @@ type availableUpdates struct {
212219
213220 // RiskConditions stores the condition for every risk (name, url, message, matchingRules).
214221 RiskConditions map [string ][]metav1.Condition
222+
223+ AlertGetter AlertGetter
224+
225+ // AlertRisks stores the condition for every alert that might have impact on cluster update
226+ AlertRisks []configv1.ConditionalUpdateRisk
215227}
216228
217229func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -317,6 +329,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
317329 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
318330 AcceptRisks : optr .availableUpdates .AcceptRisks ,
319331 RiskConditions : optr .availableUpdates .RiskConditions ,
332+ AlertRisks : optr .availableUpdates .AlertRisks ,
333+ AlertGetter : optr .availableUpdates .AlertGetter ,
320334 LastAttempt : optr .availableUpdates .LastAttempt ,
321335 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
322336 Current : * optr .availableUpdates .Current .DeepCopy (),
@@ -509,6 +523,170 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
509523 }
510524}
511525
526+ type AlertGetter interface {
527+ Get (ctx context.Context ) (prometheusv1.AlertsResult , error )
528+ }
529+
530+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) error {
531+ if u == nil || u .AlertGetter == nil {
532+ u .AlertRisks = nil
533+ return nil
534+ }
535+ alertsResult , err := u .AlertGetter .Get (ctx )
536+ if err != nil {
537+ return fmt .Errorf ("failed to get alerts: %w" , err )
538+ }
539+
540+ u .AlertRisks = alertsToRisks (alertsResult .Alerts )
541+ return nil
542+ }
543+
544+ func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
545+ klog .V (2 ).Infof ("Found %d alerts" , len (alerts ))
546+ risks := map [string ]configv1.ConditionalUpdateRisk {}
547+ for _ , alert := range alerts {
548+ var alertName string
549+ if alertName = string (alert .Labels ["alertname" ]); alertName == "" {
550+ continue
551+ }
552+ if alert .State == "pending" {
553+ continue
554+ }
555+
556+ var summary string
557+ if summary = string (alert .Annotations ["summary" ]); summary == "" {
558+ summary = alertName
559+ }
560+ if ! strings .HasSuffix (summary , "." ) {
561+ summary += "."
562+ }
563+
564+ var description string
565+ alertMessage := string (alert .Annotations ["message" ])
566+ alertDescription := string (alert .Annotations ["description" ])
567+ switch {
568+ case alertMessage != "" && alertDescription != "" :
569+ description += " The alert description is: " + alertDescription + " | " + alertMessage
570+ case alertDescription != "" :
571+ description += " The alert description is: " + alertDescription
572+ case alertMessage != "" :
573+ description += " The alert description is: " + alertMessage
574+ default :
575+ description += " The alert has no description."
576+ }
577+
578+ var runbook string
579+ if runbook = string (alert .Annotations ["runbook" ]); runbook == "" {
580+ runbook = "<alert does not have a runbook_url annotation>"
581+ }
582+
583+ details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
584+
585+ // TODO (hongkliu):
586+ alertURL := "https://console.com/monitoring/alertrules/id"
587+ alertPromQL := "todo-expression"
588+
589+ severity := string (alert .Labels ["severity" ])
590+ if severity == "critical" {
591+ if alertName == internal .AlertNamePodDisruptionBudgetLimit {
592+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
593+ }
594+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
595+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
596+ Status : metav1 .ConditionTrue ,
597+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
598+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting significant cluster issues worth investigating" , details ),
599+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
600+ })
601+ continue
602+ }
603+
604+ if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
605+ details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
606+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
607+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
608+ Status : metav1 .ConditionTrue ,
609+ Reason : internal .AlertConditionReason (string (alert .State )),
610+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which might slow node drains" , details ),
611+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
612+ })
613+ continue
614+ }
615+
616+ if internal .HavePullWaiting .Has (alertName ) ||
617+ internal .HaveNodes .Has (alertName ) ||
618+ alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
619+ alertName == internal .AlertNameVMCannotBeEvicted {
620+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
621+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
622+ Status : metav1 .ConditionTrue ,
623+ Reason : internal .AlertConditionReason (string (alert .State )),
624+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "which may slow workload redistribution during rolling node updates" , details ),
625+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
626+ })
627+ continue
628+ }
629+
630+ updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
631+ if updatePrecheck == "true" {
632+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
633+ Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
634+ Status : metav1 .ConditionTrue ,
635+ Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
636+ Message : internal .AlertConditionMessage (alertName , severity , string (alert .State ), "suggesting issues worth investigating before updating the cluster" , details ),
637+ LastTransitionTime : metav1 .NewTime (alert .ActiveAt ),
638+ })
639+ continue
640+ }
641+ }
642+
643+ klog .V (2 ).Infof ("Got %d risks" , len (risks ))
644+ if len (risks ) == 0 {
645+ return nil
646+ }
647+
648+ var ret []configv1.ConditionalUpdateRisk
649+ var keys []string
650+ for k := range risks {
651+ keys = append (keys , k )
652+ }
653+ sort .Strings (keys )
654+ for _ , k := range keys {
655+ ret = append (ret , risks [k ])
656+ }
657+ return ret
658+ }
659+
660+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url , promQL string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
661+ risk , ok := risks [riskName ]
662+ if ! ok {
663+ return configv1.ConditionalUpdateRisk {
664+ Name : riskName ,
665+ Message : message ,
666+ URL : url ,
667+ MatchingRules : []configv1.ClusterCondition {
668+ {
669+ Type : "PromQL" ,
670+ PromQL : & configv1.PromQLClusterCondition {
671+ PromQL : promQL ,
672+ },
673+ },
674+ },
675+ Conditions : []metav1.Condition {condition },
676+ }
677+ }
678+
679+ if c := meta .FindStatusCondition (risk .Conditions , condition .Type ); c != nil {
680+ c .Message = fmt .Sprintf ("%s; %s" , c .Message , condition .Message )
681+ if c .LastTransitionTime .After (condition .LastTransitionTime .Time ) {
682+ c .LastTransitionTime = condition .LastTransitionTime
683+ }
684+ meta .SetStatusCondition (& risk .Conditions , * c )
685+ }
686+
687+ return risk
688+ }
689+
512690func (u * availableUpdates ) evaluateConditionalUpdates (ctx context.Context ) {
513691 if u == nil {
514692 return
@@ -518,7 +696,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
518696 risks := risksInOrder (riskVersions )
519697 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
520698
521- if err := sanityCheck (u .ConditionalUpdates ); err != nil {
699+ if err := sanityCheck (u .ConditionalUpdates , u . AlertRisks ); err != nil {
522700 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
523701 }
524702 for i , conditionalUpdate := range u .ConditionalUpdates {
@@ -536,7 +714,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
536714 }
537715}
538716
539- func sanityCheck (updates []configv1.ConditionalUpdate ) error {
717+ func sanityCheck (updates []configv1.ConditionalUpdate , alertRisks []configv1. ConditionalUpdateRisk ) error {
540718 risks := map [string ]configv1.ConditionalUpdateRisk {}
541719 var errs []error
542720 for _ , update := range updates {
@@ -552,6 +730,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
552730 }
553731 }
554732 }
733+ for _ , alertRisk := range alertRisks {
734+ if _ , ok := risks [alertRisk .Name ]; ok {
735+ errs = append (errs , fmt .Errorf ("found alert risk and conditional update risk share the name: %s" , alertRisk .Name ))
736+ }
737+ }
555738 return utilerrors .NewAggregate (errs )
556739}
557740
@@ -596,6 +779,7 @@ const (
596779 recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
597780 recommendedReasonEvaluationFailed = "EvaluationFailed"
598781 recommendedReasonMultiple = "MultipleReasons"
782+ //recommendedReasonAlertFiring = "AlertFiring"
599783
600784 // recommendedReasonExposed is used instead of the original name if it does
601785 // not match the pattern for a valid k8s condition reason.
@@ -613,10 +797,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
613797func newRecommendedReason (now , want string ) string {
614798 switch {
615799 case now == recommendedReasonRisksNotExposed ||
616- now == recommendedReasonAllExposedRisksAccepted ||
800+ now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
801+ now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
617802 now == want :
618803 return want
619- case want == recommendedReasonRisksNotExposed :
804+ case want == recommendedReasonRisksNotExposed ||
805+ (now == recommendedReasonEvaluationFailed ) && want == recommendedReasonAllExposedRisksAccepted ||
806+ now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed ):
620807 return now
621808 default :
622809 return recommendedReasonMultiple
@@ -673,7 +860,6 @@ func evaluateConditionalUpdate(
673860 if len (errorMessages ) > 0 {
674861 recommended .Message = strings .Join (errorMessages , "\n \n " )
675862 }
676-
677863 return recommended
678864}
679865
0 commit comments