@@ -103,6 +103,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
103103 if ! acceptRisks .Equal (optrAvailableUpdates .AcceptRisks ) {
104104 needsConditionalUpdateEval = true
105105 }
106+ // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
106107 if ! needsConditionalUpdateEval {
107108 klog .V (2 ).Infof ("Available updates were recently retrieved, with less than %s elapsed since %s, will try later." , optr .minimumUpdateCheckInterval , optrAvailableUpdates .LastAttempt .Format (time .RFC3339 ))
108109 return nil
@@ -164,11 +165,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
164165 optrAvailableUpdates .AcceptRisks = acceptRisks
165166 optrAvailableUpdates .ShouldReconcileAcceptRisks = optr .shouldReconcileAcceptRisks
166167 optrAvailableUpdates .AlertGetter = optr .AlertGetter
167- if optrAvailableUpdates .ShouldReconcileAcceptRisks () {
168- if err := optrAvailableUpdates .evaluateAlertRisks (ctx ); err != nil {
169- klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
170- }
171- }
172168 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
173169
174170 queueKey := optr .queueKey ()
@@ -224,9 +220,6 @@ type availableUpdates struct {
224220 RiskConditions map [string ][]metav1.Condition
225221
226222 AlertGetter AlertGetter
227-
228- // AlertRisks stores the condition for every alert that might have impact on cluster update
229- AlertRisks []configv1.ConditionalUpdateRisk
230223}
231224
232225func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -332,7 +325,6 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
332325 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
333326 AcceptRisks : optr .availableUpdates .AcceptRisks ,
334327 RiskConditions : optr .availableUpdates .RiskConditions ,
335- AlertRisks : optr .availableUpdates .AlertRisks ,
336328 AlertGetter : optr .availableUpdates .AlertGetter ,
337329 LastAttempt : optr .availableUpdates .LastAttempt ,
338330 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
@@ -530,18 +522,15 @@ type AlertGetter interface {
530522 Get (ctx context.Context ) (prometheusv1.AlertsResult , error )
531523}
532524
533- func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) error {
525+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) ([]configv1. ConditionalUpdateRisk , error ) {
534526 if u == nil || u .AlertGetter == nil {
535- u .AlertRisks = nil
536- return nil
527+ return nil , nil
537528 }
538529 alertsResult , err := u .AlertGetter .Get (ctx )
539530 if err != nil {
540- return fmt .Errorf ("failed to get alerts: %w" , err )
531+ return nil , fmt .Errorf ("failed to get alerts: %w" , err )
541532 }
542-
543- u .AlertRisks = alertsToRisks (alertsResult .Alerts )
544- return nil
533+ return alertsToRisks (alertsResult .Alerts ), nil
545534}
546535
547536func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
@@ -579,22 +568,21 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
579568 }
580569
581570 var runbook string
571+ alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
582572 if runbook = string (alert .Annotations ["runbook" ]); runbook == "" {
583573 runbook = "<alert does not have a runbook_url annotation>"
574+ } else {
575+ alertURL = runbook
584576 }
585577
586578 details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
587579
588- // TODO (hongkliu):
589- alertURL := "https://console.com/monitoring/alertrules/id"
590- alertPromQL := "todo-expression"
591-
592580 severity := string (alert .Labels ["severity" ])
593581 if severity == "critical" {
594582 if alertName == internal .AlertNamePodDisruptionBudgetLimit {
595583 details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
596584 }
597- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
585+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
598586 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
599587 Status : metav1 .ConditionTrue ,
600588 Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
@@ -606,7 +594,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
606594
607595 if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
608596 details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
609- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
597+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
610598 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
611599 Status : metav1 .ConditionTrue ,
612600 Reason : internal .AlertConditionReason (string (alert .State )),
@@ -620,7 +608,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
620608 internal .HaveNodes .Has (alertName ) ||
621609 alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
622610 alertName == internal .AlertNameVMCannotBeEvicted {
623- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
611+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
624612 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
625613 Status : metav1 .ConditionTrue ,
626614 Reason : internal .AlertConditionReason (string (alert .State )),
@@ -632,7 +620,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
632620
633621 updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
634622 if updatePrecheck == "true" {
635- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
623+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
636624 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
637625 Status : metav1 .ConditionTrue ,
638626 Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
@@ -660,22 +648,16 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
660648 return ret
661649}
662650
663- func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url , promQL string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
651+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
664652 risk , ok := risks [riskName ]
665653 if ! ok {
666654 return configv1.ConditionalUpdateRisk {
667655 Name : riskName ,
668656 Message : message ,
669657 URL : url ,
670- MatchingRules : []configv1.ClusterCondition {
671- {
672- Type : "PromQL" ,
673- PromQL : & configv1.PromQLClusterCondition {
674- PromQL : promQL ,
675- },
676- },
677- },
678- Conditions : []metav1.Condition {condition },
658+ // Always as the alert is firing
659+ MatchingRules : []configv1.ClusterCondition {{Type : "Always" }},
660+ Conditions : []metav1.Condition {condition },
679661 }
680662 }
681663
@@ -699,9 +681,21 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
699681 risks := risksInOrder (riskVersions )
700682 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
701683
702- if err := sanityCheck (u .ConditionalUpdates , u .AlertRisks ); err != nil {
684+ var alertRisks []configv1.ConditionalUpdateRisk
685+ if u .ShouldReconcileAcceptRisks () {
686+ if evaluated , err := u .evaluateAlertRisks (ctx ); err != nil {
687+ klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
688+ } else {
689+ alertRisks = evaluated
690+ }
691+
692+ }
693+ u .attachAlertRisksToUpdates (alertRisks )
694+
695+ if err := sanityCheck (u .ConditionalUpdates ); err != nil {
703696 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
704697 }
698+
705699 for i , conditionalUpdate := range u .ConditionalUpdates {
706700 condition := evaluateConditionalUpdate (conditionalUpdate .Risks , u .AcceptRisks , u .ShouldReconcileAcceptRisks , u .RiskConditions )
707701
@@ -717,7 +711,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
717711 }
718712}
719713
720- func sanityCheck (updates []configv1.ConditionalUpdate , alertRisks []configv1. ConditionalUpdateRisk ) error {
714+ func sanityCheck (updates []configv1.ConditionalUpdate ) error {
721715 risks := map [string ]configv1.ConditionalUpdateRisk {}
722716 var errs []error
723717 for _ , update := range updates {
@@ -733,11 +727,6 @@ func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.Con
733727 }
734728 }
735729 }
736- for _ , alertRisk := range alertRisks {
737- if _ , ok := risks [alertRisk .Name ]; ok {
738- errs = append (errs , fmt .Errorf ("found alert risk and conditional update risk share the name: %s" , alertRisk .Name ))
739- }
740- }
741730 return utilerrors .NewAggregate (errs )
742731}
743732
@@ -759,6 +748,45 @@ func (u *availableUpdates) removeUpdate(image string) {
759748 }
760749}
761750
751+ func (u * availableUpdates ) attachAlertRisksToUpdates (alertRisks []configv1.ConditionalUpdateRisk ) {
752+ if u == nil || len (alertRisks ) == 0 {
753+ return
754+ }
755+ if u .RiskConditions == nil {
756+ u .RiskConditions = map [string ][]metav1.Condition {}
757+ }
758+ for _ , alertRisk := range alertRisks {
759+ u .RiskConditions [alertRisk .Name ] = alertRisk .Conditions
760+ }
761+ var conditionalUpdates []configv1.ConditionalUpdate
762+ for _ , update := range u .Updates {
763+ conditionalUpdates = append (conditionalUpdates , configv1.ConditionalUpdate {
764+ Release : update ,
765+ Risks : alertRisks ,
766+ })
767+ }
768+ u .Updates = nil
769+ for _ , conditionalUpdate := range u .ConditionalUpdates {
770+ for _ , alertRisk := range alertRisks {
771+ var found bool
772+ for _ , risk := range conditionalUpdate .Risks {
773+ if alertRisk .Name == risk .Name {
774+ found = true
775+ break
776+ }
777+ }
778+ if ! found {
779+ conditionalUpdate .Risks = append (conditionalUpdate .Risks , alertRisk )
780+ }
781+ }
782+ conditionalUpdates = append (conditionalUpdates , conditionalUpdate )
783+ }
784+ sort .Slice (conditionalUpdates , func (i , j int ) bool {
785+ return conditionalUpdates [i ].Release .Version < conditionalUpdates [j ].Release .Version
786+ })
787+ u .ConditionalUpdates = conditionalUpdates
788+ }
789+
762790func unknownExposureMessage (risk configv1.ConditionalUpdateRisk , err error ) string {
763791 template := `Could not evaluate exposure to update risk %s (%v)
764792 %s description: %s
@@ -782,7 +810,6 @@ const (
782810 recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
783811 recommendedReasonEvaluationFailed = "EvaluationFailed"
784812 recommendedReasonMultiple = "MultipleReasons"
785- //recommendedReasonAlertFiring = "AlertFiring"
786813
787814 // recommendedReasonExposed is used instead of the original name if it does
788815 // not match the pattern for a valid k8s condition reason.
0 commit comments