@@ -100,6 +100,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
100100 break
101101 }
102102 }
103+ // If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
103104 if ! needsConditionalUpdateEval {
104105 klog .V (2 ).Infof ("Available updates were recently retrieved, with less than %s elapsed since %s, will try later." , optr .minimumUpdateCheckInterval , optrAvailableUpdates .LastAttempt .Format (time .RFC3339 ))
105106 return nil
@@ -161,11 +162,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
161162 }
162163 }
163164
164- if optrAvailableUpdates .ShouldReconcileAcceptRisks () {
165- if err := optrAvailableUpdates .evaluateAlertRisks (ctx ); err != nil {
166- klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
167- }
168- }
169165 optrAvailableUpdates .evaluateConditionalUpdates (ctx )
170166
171167 queueKey := optr .queueKey ()
@@ -221,9 +217,6 @@ type availableUpdates struct {
221217 RiskConditions map [string ][]metav1.Condition
222218
223219 AlertGetter AlertGetter
224-
225- // AlertRisks stores the condition for every alert that might have impact on cluster update
226- AlertRisks []configv1.ConditionalUpdateRisk
227220}
228221
229222func (u * availableUpdates ) RecentlyAttempted (interval time.Duration ) bool {
@@ -329,7 +322,6 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
329322 ShouldReconcileAcceptRisks : optr .shouldReconcileAcceptRisks ,
330323 AcceptRisks : optr .availableUpdates .AcceptRisks ,
331324 RiskConditions : optr .availableUpdates .RiskConditions ,
332- AlertRisks : optr .availableUpdates .AlertRisks ,
333325 AlertGetter : optr .availableUpdates .AlertGetter ,
334326 LastAttempt : optr .availableUpdates .LastAttempt ,
335327 LastSyncOrConfigChange : optr .availableUpdates .LastSyncOrConfigChange ,
@@ -527,18 +519,15 @@ type AlertGetter interface {
527519 Get (ctx context.Context ) (prometheusv1.AlertsResult , error )
528520}
529521
530- func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) error {
522+ func (u * availableUpdates ) evaluateAlertRisks (ctx context.Context ) ([]configv1. ConditionalUpdateRisk , error ) {
531523 if u == nil || u .AlertGetter == nil {
532- u .AlertRisks = nil
533- return nil
524+ return nil , nil
534525 }
535526 alertsResult , err := u .AlertGetter .Get (ctx )
536527 if err != nil {
537- return fmt .Errorf ("failed to get alerts: %w" , err )
528+ return nil , fmt .Errorf ("failed to get alerts: %w" , err )
538529 }
539-
540- u .AlertRisks = alertsToRisks (alertsResult .Alerts )
541- return nil
530+ return alertsToRisks (alertsResult .Alerts ), nil
542531}
543532
544533func alertsToRisks (alerts []prometheusv1.Alert ) []configv1.ConditionalUpdateRisk {
@@ -576,22 +565,21 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
576565 }
577566
578567 var runbook string
568+ alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
579569 if runbook = string (alert .Annotations ["runbook" ]); runbook == "" {
580570 runbook = "<alert does not have a runbook_url annotation>"
571+ } else {
572+ alertURL = runbook
581573 }
582574
583575 details := fmt .Sprintf ("%s%s %s" , summary , description , runbook )
584576
585- // TODO (hongkliu):
586- alertURL := "https://console.com/monitoring/alertrules/id"
587- alertPromQL := "todo-expression"
588-
589577 severity := string (alert .Labels ["severity" ])
590578 if severity == "critical" {
591579 if alertName == internal .AlertNamePodDisruptionBudgetLimit {
592580 details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
593581 }
594- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
582+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
595583 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
596584 Status : metav1 .ConditionTrue ,
597585 Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
@@ -603,7 +591,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
603591
604592 if alertName == internal .AlertNamePodDisruptionBudgetAtLimit {
605593 details = fmt .Sprintf ("Namespace=%s, PodDisruptionBudget=%s. %s" , alert .Labels ["namespace" ], alert .Labels ["poddisruptionbudget" ], details )
606- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
594+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
607595 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
608596 Status : metav1 .ConditionTrue ,
609597 Reason : internal .AlertConditionReason (string (alert .State )),
@@ -617,7 +605,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
617605 internal .HaveNodes .Has (alertName ) ||
618606 alertName == internal .AlertNameVirtHandlerDaemonSetRolloutFailing ||
619607 alertName == internal .AlertNameVMCannotBeEvicted {
620- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
608+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
621609 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
622610 Status : metav1 .ConditionTrue ,
623611 Reason : internal .AlertConditionReason (string (alert .State )),
@@ -629,7 +617,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
629617
630618 updatePrecheck := string (alert .Labels ["openShiftUpdatePrecheck" ])
631619 if updatePrecheck == "true" {
632- risks [alertName ] = getRisk (risks , alertName , summary , alertURL , alertPromQL , metav1.Condition {
620+ risks [alertName ] = getRisk (risks , alertName , summary , alertURL , metav1.Condition {
633621 Type : internal .ConditionalUpdateRiskConditionTypeApplies ,
634622 Status : metav1 .ConditionTrue ,
635623 Reason : fmt .Sprintf ("Alert:%s" , alert .State ),
@@ -657,22 +645,16 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
657645 return ret
658646}
659647
660- func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url , promQL string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
648+ func getRisk (risks map [string ]configv1.ConditionalUpdateRisk , riskName , message , url string , condition metav1.Condition ) configv1.ConditionalUpdateRisk {
661649 risk , ok := risks [riskName ]
662650 if ! ok {
663651 return configv1.ConditionalUpdateRisk {
664652 Name : riskName ,
665653 Message : message ,
666654 URL : url ,
667- MatchingRules : []configv1.ClusterCondition {
668- {
669- Type : "PromQL" ,
670- PromQL : & configv1.PromQLClusterCondition {
671- PromQL : promQL ,
672- },
673- },
674- },
675- Conditions : []metav1.Condition {condition },
655+ // Always as the alert is firing
656+ MatchingRules : []configv1.ClusterCondition {{Type : "Always" }},
657+ Conditions : []metav1.Condition {condition },
676658 }
677659 }
678660
@@ -696,9 +678,21 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
696678 risks := risksInOrder (riskVersions )
697679 u .RiskConditions = loadRiskConditions (ctx , risks , riskVersions , u .ConditionRegistry )
698680
699- if err := sanityCheck (u .ConditionalUpdates , u .AlertRisks ); err != nil {
681+ var alertRisks []configv1.ConditionalUpdateRisk
682+ if u .ShouldReconcileAcceptRisks () {
683+ if evaluated , err := u .evaluateAlertRisks (ctx ); err != nil {
684+ klog .Errorf ("Failed to evaluate alert conditions: %v" , err )
685+ } else {
686+ alertRisks = evaluated
687+ }
688+
689+ }
690+ u .attachAlertRisksToUpdates (alertRisks )
691+
692+ if err := sanityCheck (u .ConditionalUpdates ); err != nil {
700693 klog .Errorf ("Sanity check failed which might impact risk evaluation: %v" , err )
701694 }
695+
702696 for i , conditionalUpdate := range u .ConditionalUpdates {
703697 condition := evaluateConditionalUpdate (conditionalUpdate .Risks , u .AcceptRisks , u .ShouldReconcileAcceptRisks , u .RiskConditions )
704698
@@ -714,7 +708,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
714708 }
715709}
716710
717- func sanityCheck (updates []configv1.ConditionalUpdate , alertRisks []configv1. ConditionalUpdateRisk ) error {
711+ func sanityCheck (updates []configv1.ConditionalUpdate ) error {
718712 risks := map [string ]configv1.ConditionalUpdateRisk {}
719713 var errs []error
720714 for _ , update := range updates {
@@ -730,11 +724,6 @@ func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.Con
730724 }
731725 }
732726 }
733- for _ , alertRisk := range alertRisks {
734- if _ , ok := risks [alertRisk .Name ]; ok {
735- errs = append (errs , fmt .Errorf ("found alert risk and conditional update risk share the name: %s" , alertRisk .Name ))
736- }
737- }
738727 return utilerrors .NewAggregate (errs )
739728}
740729
@@ -756,6 +745,45 @@ func (u *availableUpdates) removeUpdate(image string) {
756745 }
757746}
758747
748+ func (u * availableUpdates ) attachAlertRisksToUpdates (alertRisks []configv1.ConditionalUpdateRisk ) {
749+ if u == nil || len (alertRisks ) == 0 {
750+ return
751+ }
752+ if u .RiskConditions == nil {
753+ u .RiskConditions = map [string ][]metav1.Condition {}
754+ }
755+ for _ , alertRisk := range alertRisks {
756+ u .RiskConditions [alertRisk .Name ] = alertRisk .Conditions
757+ }
758+ var conditionalUpdates []configv1.ConditionalUpdate
759+ for _ , update := range u .Updates {
760+ conditionalUpdates = append (conditionalUpdates , configv1.ConditionalUpdate {
761+ Release : update ,
762+ Risks : alertRisks ,
763+ })
764+ }
765+ u .Updates = nil
766+ for _ , conditionalUpdate := range u .ConditionalUpdates {
767+ for _ , alertRisk := range alertRisks {
768+ var found bool
769+ for _ , risk := range conditionalUpdate .Risks {
770+ if alertRisk .Name == risk .Name {
771+ found = true
772+ break
773+ }
774+ }
775+ if ! found {
776+ conditionalUpdate .Risks = append (conditionalUpdate .Risks , alertRisk )
777+ }
778+ }
779+ conditionalUpdates = append (conditionalUpdates , conditionalUpdate )
780+ }
781+ sort .Slice (conditionalUpdates , func (i , j int ) bool {
782+ return conditionalUpdates [i ].Release .Version < conditionalUpdates [j ].Release .Version
783+ })
784+ u .ConditionalUpdates = conditionalUpdates
785+ }
786+
759787func unknownExposureMessage (risk configv1.ConditionalUpdateRisk , err error ) string {
760788 template := `Could not evaluate exposure to update risk %s (%v)
761789 %s description: %s
@@ -779,7 +807,6 @@ const (
779807 recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
780808 recommendedReasonEvaluationFailed = "EvaluationFailed"
781809 recommendedReasonMultiple = "MultipleReasons"
782- //recommendedReasonAlertFiring = "AlertFiring"
783810
784811 // recommendedReasonExposed is used instead of the original name if it does
785812 // not match the pattern for a valid k8s condition reason.
0 commit comments