Commit d7c120e

OTA-1813: Populate risks from alerts
1 parent 602f35f commit d7c120e

11 files changed

Lines changed: 1061 additions & 62 deletions

Lines changed: 95 additions & 0 deletions
@@ -0,0 +1,95 @@
+package promql
+
+import (
+    "context"
+    "fmt"
+    "sync"
+    "time"
+
+    "github.com/prometheus/client_golang/api"
+    prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
+    "github.com/prometheus/common/config"
+
+    "k8s.io/klog/v2"
+
+    "github.com/openshift/cluster-version-operator/pkg/clusterconditions"
+)
+
+type Getter interface {
+    Get(ctx context.Context) (prometheusv1.AlertsResult, error)
+}
+
+func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter {
+    p := NewPromQL(promQLTarget)
+    condition := p.Condition
+    v, ok := condition.(*PromQL)
+    if !ok {
+        panic("invalid condition type")
+    }
+    return &ocAlertGetter{promQL: v, expiration: 1 * time.Minute}
+}
+
+type ocAlertGetter struct {
+    promQL *PromQL
+
+    mutex       sync.Mutex
+    cached      prometheusv1.AlertsResult
+    expiration  time.Duration
+    lastRefresh time.Time
+}
+
+func (o *ocAlertGetter) Get(ctx context.Context) (prometheusv1.AlertsResult, error) {
+    if time.Now().After(o.lastRefresh.Add(o.expiration)) {
+        if err := o.refresh(ctx); err != nil {
+            klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err)
+        }
+    }
+    return o.cached, nil
+}
+
+func (o *ocAlertGetter) refresh(ctx context.Context) error {
+    o.mutex.Lock()
+    defer o.mutex.Unlock()
+
+    klog.Info("refresh alerts ...")
+    p := o.promQL
+    host, err := p.Host(ctx)
+    if err != nil {
+        return fmt.Errorf("failed to determine thanos IP: %w", err)
+    }
+    p.url.Host = host
+    clientConfig := api.Config{Address: p.url.String()}
+
+    if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil {
+        clientConfig.RoundTripper = roundTripper
+    } else {
+        return fmt.Errorf("creating PromQL round-tripper: %w", err)
+    }
+
+    promqlClient, err := api.NewClient(clientConfig)
+    if err != nil {
+        return fmt.Errorf("creating PromQL client: %w", err)
+    }
+
+    client := &statusCodeNotImplementedForPostClient{
+        client: promqlClient,
+    }
+
+    v1api := prometheusv1.NewAPI(client)
+
+    queryContext := ctx
+    if p.QueryTimeout > 0 {
+        var cancel context.CancelFunc
+        queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout)
+        defer cancel()
+    }
+
+    r, err := v1api.Alerts(queryContext)
+    if err != nil {
+        return fmt.Errorf("failed to get alerts: %w", err)
+    }
+    o.cached = r
+    o.lastRefresh = time.Now()
+    klog.Infof("refreshed: %d alerts", len(o.cached.Alerts))
+    return nil
+}
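
Not part of the commit: a minimal, self-contained sketch of the cache-with-expiration pattern that ocAlertGetter implements, where a consumer calls Get, the getter refreshes at most once per expiration window, and a failed refresh logs and serves the stale cache. The names fakeSource, cachedGetter, and main are illustrative only; fakeSource stands in for the Prometheus v1 API client.

// Illustrative sketch, not from the repository.
package main

import (
    "context"
    "fmt"
    "sync"
    "time"
)

// fakeSource stands in for the Thanos/Prometheus alerts endpoint.
type fakeSource struct{ calls int }

func (f *fakeSource) fetch(ctx context.Context) ([]string, error) {
    f.calls++
    return []string{"PodDisruptionBudgetAtLimit"}, nil
}

// cachedGetter mirrors the shape of ocAlertGetter: serve a cached result and
// refresh it at most once per expiration window.
type cachedGetter struct {
    source      *fakeSource
    mutex       sync.Mutex
    cached      []string
    expiration  time.Duration
    lastRefresh time.Time
}

func (c *cachedGetter) Get(ctx context.Context) ([]string, error) {
    if time.Now().After(c.lastRefresh.Add(c.expiration)) {
        if err := c.refresh(ctx); err != nil {
            // Like the commit, fall back to the stale cache instead of failing the caller.
            fmt.Printf("refresh failed, using stale cache: %v\n", err)
        }
    }
    return c.cached, nil
}

func (c *cachedGetter) refresh(ctx context.Context) error {
    c.mutex.Lock()
    defer c.mutex.Unlock()
    alerts, err := c.source.fetch(ctx)
    if err != nil {
        return err
    }
    c.cached = alerts
    c.lastRefresh = time.Now()
    return nil
}

func main() {
    g := &cachedGetter{source: &fakeSource{}, expiration: time.Minute}
    for i := 0; i < 3; i++ {
        alerts, _ := g.Get(context.Background())
        fmt.Println(alerts)
    }
    // Only the first Get triggers a fetch; the next two are served from cache.
    fmt.Println("fetch calls:", g.source.calls)
}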

pkg/cvo/availableupdates.go

Lines changed: 191 additions & 5 deletions
@@ -15,6 +15,7 @@ import (
 	"github.com/google/go-cmp/cmp"
 	"github.com/google/go-cmp/cmp/cmpopts"
 	"github.com/google/uuid"
+	prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
 
 	"k8s.io/apimachinery/pkg/api/equality"
 	"k8s.io/apimachinery/pkg/api/meta"
@@ -145,6 +146,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 	optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
 	optrAvailableUpdates.AcceptRisks = acceptRisks
 	optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry
+	optrAvailableUpdates.AlertGetter = optr.AlertGetter
 	optrAvailableUpdates.Condition = condition
 
 	responseFailed := (condition.Type == configv1.RetrievedUpdates &&
@@ -159,6 +161,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
 		}
 	}
 
+	if optrAvailableUpdates.ShouldReconcileAcceptRisks() {
+		if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil {
+			klog.Errorf("Failed to evaluate alert conditions: %v", err)
+		}
+	}
 	optrAvailableUpdates.evaluateConditionalUpdates(ctx)
 
 	queueKey := optr.queueKey()
@@ -212,6 +219,11 @@ type availableUpdates struct {
 
 	// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
 	RiskConditions map[string][]metav1.Condition
+
+	AlertGetter AlertGetter
+
+	// AlertRisks stores the condition for every alert that might have impact on cluster update
+	AlertRisks []configv1.ConditionalUpdateRisk
 }
 
 func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -317,6 +329,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
 		ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
 		AcceptRisks:                optr.availableUpdates.AcceptRisks,
 		RiskConditions:             optr.availableUpdates.RiskConditions,
+		AlertRisks:                 optr.availableUpdates.AlertRisks,
+		AlertGetter:                optr.availableUpdates.AlertGetter,
 		LastAttempt:                optr.availableUpdates.LastAttempt,
 		LastSyncOrConfigChange:     optr.availableUpdates.LastSyncOrConfigChange,
 		Current:                    *optr.availableUpdates.Current.DeepCopy(),
@@ -509,6 +523,170 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
 	}
 }
 
+type AlertGetter interface {
+	Get(ctx context.Context) (prometheusv1.AlertsResult, error)
+}
+
+func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error {
+	if u == nil || u.AlertGetter == nil {
+		u.AlertRisks = nil
+		return nil
+	}
+	alertsResult, err := u.AlertGetter.Get(ctx)
+	if err != nil {
+		return fmt.Errorf("failed to get alerts: %w", err)
+	}
+
+	u.AlertRisks = alertsToRisks(alertsResult.Alerts)
+	return nil
+}
+
+func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
+	klog.V(2).Infof("Found %d alerts", len(alerts))
+	risks := map[string]configv1.ConditionalUpdateRisk{}
+	for _, alert := range alerts {
+		var alertName string
+		if alertName = string(alert.Labels["alertname"]); alertName == "" {
+			continue
+		}
+		if alert.State == "pending" {
+			continue
+		}
+
+		var summary string
+		if summary = string(alert.Annotations["summary"]); summary == "" {
+			summary = alertName
+		}
+		if !strings.HasSuffix(summary, ".") {
+			summary += "."
+		}
+
+		var description string
+		alertMessage := string(alert.Annotations["message"])
+		alertDescription := string(alert.Annotations["description"])
+		switch {
+		case alertMessage != "" && alertDescription != "":
+			description += " The alert description is: " + alertDescription + " | " + alertMessage
+		case alertDescription != "":
+			description += " The alert description is: " + alertDescription
+		case alertMessage != "":
+			description += " The alert description is: " + alertMessage
+		default:
+			description += " The alert has no description."
+		}
+
+		var runbook string
+		if runbook = string(alert.Annotations["runbook"]); runbook == "" {
+			runbook = "<alert does not have a runbook_url annotation>"
+		}
+
+		details := fmt.Sprintf("%s%s %s", summary, description, runbook)
+
+		// TODO (hongkliu):
+		alertURL := "https://console.com/monitoring/alertrules/id"
+		alertPromQL := "todo-expression"
+
+		severity := string(alert.Labels["severity"])
+		if severity == "critical" {
+			if alertName == internal.AlertNamePodDisruptionBudgetLimit {
+				details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			}
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting significant cluster issues worth investigating", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if alertName == internal.AlertNamePodDisruptionBudgetAtLimit {
+			details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which might slow node drains", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		if internal.HavePullWaiting.Has(alertName) ||
+			internal.HaveNodes.Has(alertName) ||
+			alertName == internal.AlertNameVirtHandlerDaemonSetRolloutFailing ||
+			alertName == internal.AlertNameVMCannotBeEvicted {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             internal.AlertConditionReason(string(alert.State)),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "which may slow workload redistribution during rolling node updates", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+
+		updatePrecheck := string(alert.Labels["openShiftUpdatePrecheck"])
+		if updatePrecheck == "true" {
+			risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
+				Type:               internal.ConditionalUpdateRiskConditionTypeApplies,
+				Status:             metav1.ConditionTrue,
+				Reason:             fmt.Sprintf("Alert:%s", alert.State),
+				Message:            internal.AlertConditionMessage(alertName, severity, string(alert.State), "suggesting issues worth investigating before updating the cluster", details),
+				LastTransitionTime: metav1.NewTime(alert.ActiveAt),
+			})
+			continue
+		}
+	}
+
+	klog.V(2).Infof("Got %d risks", len(risks))
+	if len(risks) == 0 {
+		return nil
+	}
+
+	var ret []configv1.ConditionalUpdateRisk
+	var keys []string
+	for k := range risks {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	for _, k := range keys {
+		ret = append(ret, risks[k])
+	}
+	return ret
+}
+
+func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
+	risk, ok := risks[riskName]
+	if !ok {
+		return configv1.ConditionalUpdateRisk{
+			Name:    riskName,
+			Message: message,
+			URL:     url,
+			MatchingRules: []configv1.ClusterCondition{
+				{
+					Type: "PromQL",
+					PromQL: &configv1.PromQLClusterCondition{
+						PromQL: promQL,
+					},
+				},
+			},
+			Conditions: []metav1.Condition{condition},
+		}
+	}
+
+	if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
+		c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
+		if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
+			c.LastTransitionTime = condition.LastTransitionTime
+		}
+		meta.SetStatusCondition(&risk.Conditions, *c)
+	}
+
+	return risk
+}
+
 func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	if u == nil {
 		return
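
Not part of the commit: a self-contained sketch of the merge behavior in getRisk when two firing instances of the same alert map to one risk entry, namely appending the new message with "; " and keeping the earliest transition time. The riskEntry, riskCondition, and mergeRisk names are simplified stand-ins for configv1.ConditionalUpdateRisk, metav1.Condition, and getRisk; values are illustrative.

// Illustrative sketch, not from the repository.
package main

import (
    "fmt"
    "time"
)

type riskCondition struct {
    Type               string
    Message            string
    LastTransitionTime time.Time
}

type riskEntry struct {
    Name       string
    Conditions []riskCondition
}

// mergeRisk mirrors the shape of getRisk: create the entry on first sight, otherwise
// append the new message and keep the earliest transition time for the same condition type.
func mergeRisk(risks map[string]riskEntry, name string, cond riskCondition) riskEntry {
    risk, ok := risks[name]
    if !ok {
        return riskEntry{Name: name, Conditions: []riskCondition{cond}}
    }
    for i := range risk.Conditions {
        if risk.Conditions[i].Type == cond.Type {
            risk.Conditions[i].Message = fmt.Sprintf("%s; %s", risk.Conditions[i].Message, cond.Message)
            if risk.Conditions[i].LastTransitionTime.After(cond.LastTransitionTime) {
                risk.Conditions[i].LastTransitionTime = cond.LastTransitionTime
            }
        }
    }
    return risk
}

func main() {
    risks := map[string]riskEntry{}
    earlier := time.Now().Add(-10 * time.Minute)
    later := time.Now()

    // First PodDisruptionBudgetAtLimit instance creates the risk.
    risks["PodDisruptionBudgetAtLimit"] = mergeRisk(risks, "PodDisruptionBudgetAtLimit",
        riskCondition{Type: "Applies", Message: "Namespace=ns-a is at its PDB limit", LastTransitionTime: later})
    // Second instance for another namespace is folded into the same risk.
    risks["PodDisruptionBudgetAtLimit"] = mergeRisk(risks, "PodDisruptionBudgetAtLimit",
        riskCondition{Type: "Applies", Message: "Namespace=ns-b is at its PDB limit", LastTransitionTime: earlier})

    r := risks["PodDisruptionBudgetAtLimit"]
    fmt.Println(r.Conditions[0].Message)                           // both namespaces, joined with "; "
    fmt.Println(r.Conditions[0].LastTransitionTime.Equal(earlier)) // true: the earliest activation wins
}

In the commit the same folding happens keyed by alert name, with per-instance Namespace/PodDisruptionBudget labels prefixed into the details string, so a single risk can describe several affected objects.
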
@@ -518,7 +696,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	risks := risksInOrder(riskVersions)
 	u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)
 
-	if err := sanityCheck(u.ConditionalUpdates); err != nil {
+	if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil {
 		klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
 	}
 	for i, conditionalUpdate := range u.ConditionalUpdates {
@@ -536,7 +714,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
 	}
 }
 
-func sanityCheck(updates []configv1.ConditionalUpdate) error {
+func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error {
 	risks := map[string]configv1.ConditionalUpdateRisk{}
 	var errs []error
 	for _, update := range updates {
@@ -552,6 +730,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
 			}
 		}
 	}
+	for _, alertRisk := range alertRisks {
+		if _, ok := risks[alertRisk.Name]; ok {
+			errs = append(errs, fmt.Errorf("found alert risk and conditional update risk sharing the name: %s", alertRisk.Name))
+		}
+	}
 	return utilerrors.NewAggregate(errs)
 }
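
Not part of the commit: a sketch of the new collision check in sanityCheck, which flags alert-derived risks that reuse a name already claimed by a conditional-update risk, presumably because both populations are keyed by name during evaluation. The helper checkNameCollisions is made up for the sketch, and errors.Join stands in for the utilerrors.NewAggregate used in the real code; the risk names are illustrative.

// Illustrative sketch, not from the repository.
package main

import (
    "errors"
    "fmt"
)

// checkNameCollisions reports every alert risk whose name duplicates a conditional-update risk.
func checkNameCollisions(updateRiskNames, alertRiskNames []string) error {
    known := map[string]bool{}
    for _, name := range updateRiskNames {
        known[name] = true
    }
    var errs []error
    for _, name := range alertRiskNames {
        if known[name] {
            errs = append(errs, fmt.Errorf("found alert risk and conditional update risk sharing the name: %s", name))
        }
    }
    return errors.Join(errs...)
}

func main() {
    err := checkNameCollisions(
        []string{"SomeGraphDataRisk", "PodDisruptionBudgetAtLimit"},
        []string{"PodDisruptionBudgetAtLimit", "KubeletDown"},
    )
    fmt.Println(err) // reports the PodDisruptionBudgetAtLimit collision
}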

@@ -596,6 +779,7 @@ const (
 	recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
 	recommendedReasonEvaluationFailed        = "EvaluationFailed"
 	recommendedReasonMultiple                = "MultipleReasons"
+	//recommendedReasonAlertFiring = "AlertFiring"
 
 	// recommendedReasonExposed is used instead of the original name if it does
 	// not match the pattern for a valid k8s condition reason.
@@ -613,10 +797,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
 func newRecommendedReason(now, want string) string {
 	switch {
 	case now == recommendedReasonRisksNotExposed ||
-		now == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
+		now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
 		now == want:
 		return want
-	case want == recommendedReasonRisksNotExposed:
+	case want == recommendedReasonRisksNotExposed ||
+		(now == recommendedReasonEvaluationFailed) && want == recommendedReasonAllExposedRisksAccepted ||
+		now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed):
 		return now
 	default:
 		return recommendedReasonMultiple
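
Not part of the commit: a sketch of the reason-merging precedence that the reworked newRecommendedReason switch encodes when several risks contribute to one Recommended condition. The mergeReason function copies the patched branch structure; the constant values for RisksNotExposed and Exposed are placeholders here, since only AllExposedRisksAccepted, EvaluationFailed, and MultipleReasons appear verbatim in this diff.

// Illustrative sketch, not from the repository.
package main

import "fmt"

const (
    risksNotExposed = "RisksNotExposed" // placeholder value
    exposed         = "Exposed"         // placeholder value
    allAccepted     = "AllExposedRisksAccepted"
    evalFailed      = "EvaluationFailed"
    multiple        = "MultipleReasons"
)

// mergeReason follows the same branch structure as the patched newRecommendedReason.
func mergeReason(now, want string) string {
    switch {
    case now == risksNotExposed ||
        now == allAccepted && want != risksNotExposed ||
        now == evalFailed && want == exposed ||
        now == want:
        return want
    case want == risksNotExposed ||
        now == evalFailed && want == allAccepted ||
        now == exposed && (want == allAccepted || want == evalFailed):
        return now
    default:
        return multiple
    }
}

func main() {
    fmt.Println(mergeReason(allAccepted, exposed))    // Exposed: an exposed, unaccepted risk overrides accepted ones
    fmt.Println(mergeReason(exposed, allAccepted))    // Exposed: an already-exposed risk keeps the stronger reason
    fmt.Println(mergeReason(evalFailed, exposed))     // Exposed: a confirmed exposure outweighs an evaluation failure
    fmt.Println(mergeReason(evalFailed, allAccepted)) // EvaluationFailed: a failure is still reported over accepted risks
}
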
@@ -673,7 +860,6 @@ func evaluateConditionalUpdate(
 	if len(errorMessages) > 0 {
 		recommended.Message = strings.Join(errorMessages, "\n\n")
 	}
-
 	return recommended
 }
 