Skip to content

Commit 9b293cf

Browse files
committed
ToSquash: Associate alert risks to every update
1 parent 1203f1c commit 9b293cf

7 files changed

Lines changed: 538 additions & 328 deletions

File tree

pkg/cvo/availableupdates.go

Lines changed: 69 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
103103
if !acceptRisks.Equal(optrAvailableUpdates.AcceptRisks) {
104104
needsConditionalUpdateEval = true
105105
}
106+
// If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
106107
if !needsConditionalUpdateEval {
107108
klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
108109
return nil
@@ -164,11 +165,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
164165
optrAvailableUpdates.AcceptRisks = acceptRisks
165166
optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
166167
optrAvailableUpdates.AlertGetter = optr.AlertGetter
167-
if optrAvailableUpdates.ShouldReconcileAcceptRisks() {
168-
if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil {
169-
klog.Errorf("Failed to evaluate alert conditions: %v", err)
170-
}
171-
}
172168
optrAvailableUpdates.evaluateConditionalUpdates(ctx)
173169

174170
queueKey := optr.queueKey()
@@ -224,9 +220,6 @@ type availableUpdates struct {
224220
RiskConditions map[string][]metav1.Condition
225221

226222
AlertGetter AlertGetter
227-
228-
// AlertRisks stores the condition for every alert that might have impact on cluster update
229-
AlertRisks []configv1.ConditionalUpdateRisk
230223
}
231224

232225
func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -332,7 +325,6 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
332325
ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
333326
AcceptRisks: optr.availableUpdates.AcceptRisks,
334327
RiskConditions: optr.availableUpdates.RiskConditions,
335-
AlertRisks: optr.availableUpdates.AlertRisks,
336328
AlertGetter: optr.availableUpdates.AlertGetter,
337329
LastAttempt: optr.availableUpdates.LastAttempt,
338330
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
@@ -530,18 +522,15 @@ type AlertGetter interface {
530522
Get(ctx context.Context) (prometheusv1.AlertsResult, error)
531523
}
532524

533-
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error {
525+
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) ([]configv1.ConditionalUpdateRisk, error) {
534526
if u == nil || u.AlertGetter == nil {
535-
u.AlertRisks = nil
536-
return nil
527+
return nil, nil
537528
}
538529
alertsResult, err := u.AlertGetter.Get(ctx)
539530
if err != nil {
540-
return fmt.Errorf("failed to get alerts: %w", err)
531+
return nil, fmt.Errorf("failed to get alerts: %w", err)
541532
}
542-
543-
u.AlertRisks = alertsToRisks(alertsResult.Alerts)
544-
return nil
533+
return alertsToRisks(alertsResult.Alerts), nil
545534
}
546535

547536
func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
@@ -579,22 +568,21 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
579568
}
580569

581570
var runbook string
571+
alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
582572
if runbook = string(alert.Annotations["runbook"]); runbook == "" {
583573
runbook = "<alert does not have a runbook_url annotation>"
574+
} else {
575+
alertURL = runbook
584576
}
585577

586578
details := fmt.Sprintf("%s%s %s", summary, description, runbook)
587579

588-
// TODO (hongkliu):
589-
alertURL := "https://console.com/monitoring/alertrules/id"
590-
alertPromQL := "todo-expression"
591-
592580
severity := string(alert.Labels["severity"])
593581
if severity == "critical" {
594582
if alertName == internal.AlertNamePodDisruptionBudgetLimit {
595583
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
596584
}
597-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
585+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
598586
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
599587
Status: metav1.ConditionTrue,
600588
Reason: fmt.Sprintf("Alert:%s", alert.State),
@@ -606,7 +594,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
606594

607595
if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
608596
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
609-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
597+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
610598
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
611599
Status: metav1.ConditionTrue,
612600
Reason: internal.AlertConditionReason(string(alert.State)),
@@ -620,7 +608,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
620608
internal.HaveNodes.Has(alertName) ||
621609
alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
622610
alertName == internal.AlertNameVMCannotBeEvicted {
623-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
611+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
624612
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
625613
Status: metav1.ConditionTrue,
626614
Reason: internal.AlertConditionReason(string(alert.State)),
@@ -632,7 +620,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
632620

633621
updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
634622
if updatePrecheck == "true" {
635-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
623+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
636624
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
637625
Status: metav1.ConditionTrue,
638626
Reason: fmt.Sprintf("Alert:%s", alert.State),
@@ -660,22 +648,16 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
660648
return ret
661649
}
662650

663-
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
651+
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
664652
risk, ok := risks[riskName]
665653
if !ok {
666654
return configv1.ConditionalUpdateRisk{
667655
Name: riskName,
668656
Message: message,
669657
URL: url,
670-
MatchingRules: []configv1.ClusterCondition{
671-
{
672-
Type: "PromQL",
673-
PromQL: &configv1.PromQLClusterCondition{
674-
PromQL: promQL,
675-
},
676-
},
677-
},
678-
Conditions: []metav1.Condition{condition},
658+
// Always as the alert is firing
659+
MatchingRules: []configv1.ClusterCondition{{Type: "Always"}},
660+
Conditions: []metav1.Condition{condition},
679661
}
680662
}
681663

@@ -699,9 +681,21 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
699681
risks := risksInOrder(riskVersions)
700682
u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)
701683

702-
if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil {
684+
var alertRisks []configv1.ConditionalUpdateRisk
685+
if u.ShouldReconcileAcceptRisks() {
686+
if evaluated, err := u.evaluateAlertRisks(ctx); err != nil {
687+
klog.Errorf("Failed to evaluate alert conditions: %v", err)
688+
} else {
689+
alertRisks = evaluated
690+
}
691+
692+
}
693+
u.attachAlertRisksToUpdates(alertRisks)
694+
695+
if err := sanityCheck(u.ConditionalUpdates); err != nil {
703696
klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
704697
}
698+
705699
for i, conditionalUpdate := range u.ConditionalUpdates {
706700
condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions)
707701

@@ -717,7 +711,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
717711
}
718712
}
719713

720-
func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error {
714+
func sanityCheck(updates []configv1.ConditionalUpdate) error {
721715
risks := map[string]configv1.ConditionalUpdateRisk{}
722716
var errs []error
723717
for _, update := range updates {
@@ -733,11 +727,6 @@ func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.Con
733727
}
734728
}
735729
}
736-
for _, alertRisk := range alertRisks {
737-
if _, ok := risks[alertRisk.Name]; ok {
738-
errs = append(errs, fmt.Errorf("found alert risk and conditional update risk share the name: %s", alertRisk.Name))
739-
}
740-
}
741730
return utilerrors.NewAggregate(errs)
742731
}
743732

@@ -759,6 +748,45 @@ func (u *availableUpdates) removeUpdate(image string) {
759748
}
760749
}
761750

751+
func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) {
752+
if u == nil || len(alertRisks) == 0 {
753+
return
754+
}
755+
if u.RiskConditions == nil {
756+
u.RiskConditions = map[string][]metav1.Condition{}
757+
}
758+
for _, alertRisk := range alertRisks {
759+
u.RiskConditions[alertRisk.Name] = alertRisk.Conditions
760+
}
761+
var conditionalUpdates []configv1.ConditionalUpdate
762+
for _, update := range u.Updates {
763+
conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{
764+
Release: update,
765+
Risks: alertRisks,
766+
})
767+
}
768+
u.Updates = nil
769+
for _, conditionalUpdate := range u.ConditionalUpdates {
770+
for _, alertRisk := range alertRisks {
771+
var found bool
772+
for _, risk := range conditionalUpdate.Risks {
773+
if alertRisk.Name == risk.Name {
774+
found = true
775+
break
776+
}
777+
}
778+
if !found {
779+
conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk)
780+
}
781+
}
782+
conditionalUpdates = append(conditionalUpdates, conditionalUpdate)
783+
}
784+
sort.Slice(conditionalUpdates, func(i, j int) bool {
785+
return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version
786+
})
787+
u.ConditionalUpdates = conditionalUpdates
788+
}
789+
762790
func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string {
763791
template := `Could not evaluate exposure to update risk %s (%v)
764792
%s description: %s
@@ -782,7 +810,6 @@ const (
782810
recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
783811
recommendedReasonEvaluationFailed = "EvaluationFailed"
784812
recommendedReasonMultiple = "MultipleReasons"
785-
//recommendedReasonAlertFiring = "AlertFiring"
786813

787814
// recommendedReasonExposed is used instead of the original name if it does
788815
// not match the pattern for a valid k8s condition reason.

0 commit comments

Comments
 (0)