Skip to content

Commit 239bbaf

Browse files
committed
ToSquash: Associate alert risks to every update
1 parent ee02faa commit 239bbaf

7 files changed

Lines changed: 538 additions & 328 deletions

File tree

pkg/cvo/availableupdates.go

Lines changed: 69 additions & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
100100
break
101101
}
102102
}
103+
// If risks from alerts, conditional updates might be stale for maximally 2 x minimumUpdateCheckInterval
103104
if !needsConditionalUpdateEval {
104105
klog.V(2).Infof("Available updates were recently retrieved, with less than %s elapsed since %s, will try later.", optr.minimumUpdateCheckInterval, optrAvailableUpdates.LastAttempt.Format(time.RFC3339))
105106
return nil
@@ -161,11 +162,6 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
161162
}
162163
}
163164

164-
if optrAvailableUpdates.ShouldReconcileAcceptRisks() {
165-
if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil {
166-
klog.Errorf("Failed to evaluate alert conditions: %v", err)
167-
}
168-
}
169165
optrAvailableUpdates.evaluateConditionalUpdates(ctx)
170166

171167
queueKey := optr.queueKey()
@@ -221,9 +217,6 @@ type availableUpdates struct {
221217
RiskConditions map[string][]metav1.Condition
222218

223219
AlertGetter AlertGetter
224-
225-
// AlertRisks stores the condition for every alert that might have impact on cluster update
226-
AlertRisks []configv1.ConditionalUpdateRisk
227220
}
228221

229222
func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -329,7 +322,6 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
329322
ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
330323
AcceptRisks: optr.availableUpdates.AcceptRisks,
331324
RiskConditions: optr.availableUpdates.RiskConditions,
332-
AlertRisks: optr.availableUpdates.AlertRisks,
333325
AlertGetter: optr.availableUpdates.AlertGetter,
334326
LastAttempt: optr.availableUpdates.LastAttempt,
335327
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
@@ -527,18 +519,15 @@ type AlertGetter interface {
527519
Get(ctx context.Context) (prometheusv1.AlertsResult, error)
528520
}
529521

530-
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error {
522+
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) ([]configv1.ConditionalUpdateRisk, error) {
531523
if u == nil || u.AlertGetter == nil {
532-
u.AlertRisks = nil
533-
return nil
524+
return nil, nil
534525
}
535526
alertsResult, err := u.AlertGetter.Get(ctx)
536527
if err != nil {
537-
return fmt.Errorf("failed to get alerts: %w", err)
528+
return nil, fmt.Errorf("failed to get alerts: %w", err)
538529
}
539-
540-
u.AlertRisks = alertsToRisks(alertsResult.Alerts)
541-
return nil
530+
return alertsToRisks(alertsResult.Alerts), nil
542531
}
543532

544533
func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
@@ -576,22 +565,21 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
576565
}
577566

578567
var runbook string
568+
alertURL := "https://github.com/openshift/runbooks/tree/master/alerts?runbook=notfound"
579569
if runbook = string(alert.Annotations["runbook"]); runbook == "" {
580570
runbook = "<alert does not have a runbook_url annotation>"
571+
} else {
572+
alertURL = runbook
581573
}
582574

583575
details := fmt.Sprintf("%s%s %s", summary, description, runbook)
584576

585-
// TODO (hongkliu):
586-
alertURL := "https://console.com/monitoring/alertrules/id"
587-
alertPromQL := "todo-expression"
588-
589577
severity := string(alert.Labels["severity"])
590578
if severity == "critical" {
591579
if alertName == internal.AlertNamePodDisruptionBudgetLimit {
592580
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
593581
}
594-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
582+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
595583
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
596584
Status: metav1.ConditionTrue,
597585
Reason: fmt.Sprintf("Alert:%s", alert.State),
@@ -603,7 +591,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
603591

604592
if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
605593
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
606-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
594+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
607595
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
608596
Status: metav1.ConditionTrue,
609597
Reason: internal.AlertConditionReason(string(alert.State)),
@@ -617,7 +605,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
617605
internal.HaveNodes.Has(alertName) ||
618606
alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
619607
alertName == internal.AlertNameVMCannotBeEvicted {
620-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
608+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
621609
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
622610
Status: metav1.ConditionTrue,
623611
Reason: internal.AlertConditionReason(string(alert.State)),
@@ -629,7 +617,7 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
629617

630618
updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
631619
if updatePrecheck == "true" {
632-
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
620+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, metav1.Condition{
633621
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
634622
Status: metav1.ConditionTrue,
635623
Reason: fmt.Sprintf("Alert:%s", alert.State),
@@ -657,22 +645,16 @@ func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk
657645
return ret
658646
}
659647

660-
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
648+
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
661649
risk, ok := risks[riskName]
662650
if !ok {
663651
return configv1.ConditionalUpdateRisk{
664652
Name: riskName,
665653
Message: message,
666654
URL: url,
667-
MatchingRules: []configv1.ClusterCondition{
668-
{
669-
Type: "PromQL",
670-
PromQL: &configv1.PromQLClusterCondition{
671-
PromQL: promQL,
672-
},
673-
},
674-
},
675-
Conditions: []metav1.Condition{condition},
655+
// Always as the alert is firing
656+
MatchingRules: []configv1.ClusterCondition{{Type: "Always"}},
657+
Conditions: []metav1.Condition{condition},
676658
}
677659
}
678660

@@ -696,9 +678,21 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
696678
risks := risksInOrder(riskVersions)
697679
u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)
698680

699-
if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil {
681+
var alertRisks []configv1.ConditionalUpdateRisk
682+
if u.ShouldReconcileAcceptRisks() {
683+
if evaluated, err := u.evaluateAlertRisks(ctx); err != nil {
684+
klog.Errorf("Failed to evaluate alert conditions: %v", err)
685+
} else {
686+
alertRisks = evaluated
687+
}
688+
689+
}
690+
u.attachAlertRisksToUpdates(alertRisks)
691+
692+
if err := sanityCheck(u.ConditionalUpdates); err != nil {
700693
klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
701694
}
695+
702696
for i, conditionalUpdate := range u.ConditionalUpdates {
703697
condition := evaluateConditionalUpdate(conditionalUpdate.Risks, u.AcceptRisks, u.ShouldReconcileAcceptRisks, u.RiskConditions)
704698

@@ -714,7 +708,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
714708
}
715709
}
716710

717-
func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error {
711+
func sanityCheck(updates []configv1.ConditionalUpdate) error {
718712
risks := map[string]configv1.ConditionalUpdateRisk{}
719713
var errs []error
720714
for _, update := range updates {
@@ -730,11 +724,6 @@ func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.Con
730724
}
731725
}
732726
}
733-
for _, alertRisk := range alertRisks {
734-
if _, ok := risks[alertRisk.Name]; ok {
735-
errs = append(errs, fmt.Errorf("found alert risk and conditional update risk share the name: %s", alertRisk.Name))
736-
}
737-
}
738727
return utilerrors.NewAggregate(errs)
739728
}
740729

@@ -756,6 +745,45 @@ func (u *availableUpdates) removeUpdate(image string) {
756745
}
757746
}
758747

748+
func (u *availableUpdates) attachAlertRisksToUpdates(alertRisks []configv1.ConditionalUpdateRisk) {
749+
if u == nil || len(alertRisks) == 0 {
750+
return
751+
}
752+
if u.RiskConditions == nil {
753+
u.RiskConditions = map[string][]metav1.Condition{}
754+
}
755+
for _, alertRisk := range alertRisks {
756+
u.RiskConditions[alertRisk.Name] = alertRisk.Conditions
757+
}
758+
var conditionalUpdates []configv1.ConditionalUpdate
759+
for _, update := range u.Updates {
760+
conditionalUpdates = append(conditionalUpdates, configv1.ConditionalUpdate{
761+
Release: update,
762+
Risks: alertRisks,
763+
})
764+
}
765+
u.Updates = nil
766+
for _, conditionalUpdate := range u.ConditionalUpdates {
767+
for _, alertRisk := range alertRisks {
768+
var found bool
769+
for _, risk := range conditionalUpdate.Risks {
770+
if alertRisk.Name == risk.Name {
771+
found = true
772+
break
773+
}
774+
}
775+
if !found {
776+
conditionalUpdate.Risks = append(conditionalUpdate.Risks, alertRisk)
777+
}
778+
}
779+
conditionalUpdates = append(conditionalUpdates, conditionalUpdate)
780+
}
781+
sort.Slice(conditionalUpdates, func(i, j int) bool {
782+
return conditionalUpdates[i].Release.Version < conditionalUpdates[j].Release.Version
783+
})
784+
u.ConditionalUpdates = conditionalUpdates
785+
}
786+
759787
func unknownExposureMessage(risk configv1.ConditionalUpdateRisk, err error) string {
760788
template := `Could not evaluate exposure to update risk %s (%v)
761789
%s description: %s
@@ -779,7 +807,6 @@ const (
779807
recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
780808
recommendedReasonEvaluationFailed = "EvaluationFailed"
781809
recommendedReasonMultiple = "MultipleReasons"
782-
//recommendedReasonAlertFiring = "AlertFiring"
783810

784811
// recommendedReasonExposed is used instead of the original name if it does
785812
// not match the pattern for a valid k8s condition reason.

0 commit comments

Comments
 (0)