Skip to content

Commit 60ff3f6

Browse files
committed
OTA-1813: Populate risks from alerts
1 parent 0fdf01d commit 60ff3f6

11 files changed

Lines changed: 1030 additions & 62 deletions

File tree

Lines changed: 95 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,95 @@
1+
package promql
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"sync"
7+
"time"
8+
9+
"github.com/prometheus/client_golang/api"
10+
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
11+
"github.com/prometheus/common/config"
12+
13+
"k8s.io/klog/v2"
14+
15+
"github.com/openshift/cluster-version-operator/pkg/clusterconditions"
16+
)
17+
18+
type Getter interface {
19+
Get(ctx context.Context) (prometheusv1.AlertsResult, error)
20+
}
21+
22+
func NewAlertGetter(promQLTarget clusterconditions.PromQLTarget) Getter {
23+
p := NewPromQL(promQLTarget)
24+
condition := p.Condition
25+
v, ok := condition.(*PromQL)
26+
if !ok {
27+
panic("invalid condition type")
28+
}
29+
return &ocAlertGetter{promQL: v, expiration: 1 * time.Minute}
30+
}
31+
32+
type ocAlertGetter struct {
33+
promQL *PromQL
34+
35+
mutex sync.Mutex
36+
cached prometheusv1.AlertsResult
37+
expiration time.Duration
38+
lastRefresh time.Time
39+
}
40+
41+
func (o *ocAlertGetter) Get(ctx context.Context) (prometheusv1.AlertsResult, error) {
42+
if time.Now().After(o.lastRefresh.Add(o.expiration)) {
43+
if err := o.refresh(ctx); err != nil {
44+
klog.Errorf("Failed to refresh alerts, using stale cache instead: %v", err)
45+
}
46+
}
47+
return o.cached, nil
48+
}
49+
50+
func (o *ocAlertGetter) refresh(ctx context.Context) error {
51+
o.mutex.Lock()
52+
defer o.mutex.Unlock()
53+
54+
klog.Info("refresh alerts ...")
55+
p := o.promQL
56+
host, err := p.Host(ctx)
57+
if err != nil {
58+
return fmt.Errorf("failure determine thanos IP: %w", err)
59+
}
60+
p.url.Host = host
61+
clientConfig := api.Config{Address: p.url.String()}
62+
63+
if roundTripper, err := config.NewRoundTripperFromConfig(p.HTTPClientConfig, "cluster-conditions"); err == nil {
64+
clientConfig.RoundTripper = roundTripper
65+
} else {
66+
return fmt.Errorf("creating PromQL round-tripper: %w", err)
67+
}
68+
69+
promqlClient, err := api.NewClient(clientConfig)
70+
if err != nil {
71+
return fmt.Errorf("creating PromQL client: %w", err)
72+
}
73+
74+
client := &statusCodeNotImplementedForPostClient{
75+
client: promqlClient,
76+
}
77+
78+
v1api := prometheusv1.NewAPI(client)
79+
80+
queryContext := ctx
81+
if p.QueryTimeout > 0 {
82+
var cancel context.CancelFunc
83+
queryContext, cancel = context.WithTimeout(ctx, p.QueryTimeout)
84+
defer cancel()
85+
}
86+
87+
r, err := v1api.Alerts(queryContext)
88+
if err != nil {
89+
return fmt.Errorf("failed to get alerts: %w", err)
90+
}
91+
o.cached = r
92+
o.lastRefresh = time.Now()
93+
klog.Infof("refreshed: %d alerts", len(o.cached.Alerts))
94+
return nil
95+
}

pkg/cvo/availableupdates.go

Lines changed: 187 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ import (
1515
"github.com/google/go-cmp/cmp"
1616
"github.com/google/go-cmp/cmp/cmpopts"
1717
"github.com/google/uuid"
18+
prometheusv1 "github.com/prometheus/client_golang/api/prometheus/v1"
1819

1920
"k8s.io/apimachinery/pkg/api/equality"
2021
"k8s.io/apimachinery/pkg/api/meta"
@@ -145,6 +146,7 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
145146
optrAvailableUpdates.ShouldReconcileAcceptRisks = optr.shouldReconcileAcceptRisks
146147
optrAvailableUpdates.AcceptRisks = acceptRisks
147148
optrAvailableUpdates.ConditionRegistry = optr.conditionRegistry
149+
optrAvailableUpdates.AlertGetter = optr.AlertGetter
148150
optrAvailableUpdates.Condition = condition
149151

150152
responseFailed := (condition.Type == configv1.RetrievedUpdates &&
@@ -159,6 +161,11 @@ func (optr *Operator) syncAvailableUpdates(ctx context.Context, config *configv1
159161
}
160162
}
161163

164+
if optrAvailableUpdates.ShouldReconcileAcceptRisks() {
165+
if err := optrAvailableUpdates.evaluateAlertRisks(ctx); err != nil {
166+
klog.Errorf("Failed to evaluate alert conditions: %v", err)
167+
}
168+
}
162169
optrAvailableUpdates.evaluateConditionalUpdates(ctx)
163170

164171
queueKey := optr.queueKey()
@@ -212,6 +219,11 @@ type availableUpdates struct {
212219

213220
// RiskConditions stores the condition for every risk (name, url, message, matchingRules).
214221
RiskConditions map[string][]metav1.Condition
222+
223+
AlertGetter AlertGetter
224+
225+
// AlertRisks stores the condition for every alert that might have impact on cluster update
226+
AlertRisks []configv1.ConditionalUpdateRisk
215227
}
216228

217229
func (u *availableUpdates) RecentlyAttempted(interval time.Duration) bool {
@@ -317,6 +329,8 @@ func (optr *Operator) getAvailableUpdates() *availableUpdates {
317329
ShouldReconcileAcceptRisks: optr.shouldReconcileAcceptRisks,
318330
AcceptRisks: optr.availableUpdates.AcceptRisks,
319331
RiskConditions: optr.availableUpdates.RiskConditions,
332+
AlertRisks: optr.availableUpdates.AlertRisks,
333+
AlertGetter: optr.availableUpdates.AlertGetter,
320334
LastAttempt: optr.availableUpdates.LastAttempt,
321335
LastSyncOrConfigChange: optr.availableUpdates.LastSyncOrConfigChange,
322336
Current: *optr.availableUpdates.Current.DeepCopy(),
@@ -513,6 +527,166 @@ func calculateAvailableUpdatesStatus(ctx context.Context, clusterID string, tran
513527
}
514528
}
515529

530+
type AlertGetter interface {
531+
Get(ctx context.Context) (prometheusv1.AlertsResult, error)
532+
}
533+
534+
func (u *availableUpdates) evaluateAlertRisks(ctx context.Context) error {
535+
if u == nil || u.AlertGetter == nil {
536+
u.AlertRisks = nil
537+
return nil
538+
}
539+
alertsResult, err := u.AlertGetter.Get(ctx)
540+
if err != nil {
541+
return fmt.Errorf("failed to get alerts: %w", err)
542+
}
543+
544+
u.AlertRisks = alertsToRisks(alertsResult.Alerts)
545+
return nil
546+
}
547+
548+
func alertsToRisks(alerts []prometheusv1.Alert) []configv1.ConditionalUpdateRisk {
549+
klog.V(2).Infof("Found %d alerts", len(alerts))
550+
risks := map[string]configv1.ConditionalUpdateRisk{}
551+
for _, alert := range alerts {
552+
var alertName string
553+
if alertName = string(alert.Labels["alertname"]); alertName == "" {
554+
continue
555+
}
556+
if alert.State == "pending" {
557+
continue
558+
}
559+
560+
var summary string
561+
if summary = string(alert.Annotations["summary"]); summary == "" {
562+
summary = alertName
563+
}
564+
if !strings.HasSuffix(summary, ".") {
565+
summary += "."
566+
}
567+
568+
var description string
569+
alertMessage := string(alert.Annotations["message"])
570+
alertDescription := string(alert.Annotations["description"])
571+
switch {
572+
case alertMessage != "" && alertDescription != "":
573+
description += " The alert description is: " + alertDescription + " | " + alertMessage
574+
case alertDescription != "":
575+
description += " The alert description is: " + alertDescription
576+
case alertMessage != "":
577+
description += " The alert description is: " + alertMessage
578+
default:
579+
description += " The alert has no description."
580+
}
581+
582+
var runbook string
583+
if runbook = string(alert.Annotations["runbook"]); runbook == "" {
584+
runbook = "<alert does not have a runbook_url annotation>"
585+
}
586+
587+
details := fmt.Sprintf("%s%s %s", summary, description, runbook)
588+
589+
// TODO (hongkliu):
590+
alertURL := "https://console.com/monitoring/alertrules/id"
591+
alertPromQL := "todo-expression"
592+
593+
severity := string(alert.Labels["severity"])
594+
if severity == "critical" {
595+
if alertName == "PodDisruptionBudgetLimit" {
596+
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
597+
}
598+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
599+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
600+
Status: metav1.ConditionTrue,
601+
Reason: fmt.Sprintf("Alert:%s", alert.State),
602+
Message: fmt.Sprintf("%s alert %s %s, suggesting significant cluster issues worth investigating. %s", severity, alertName, alert.State, details),
603+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
604+
})
605+
continue
606+
}
607+
608+
if alertName == "PodDisruptionBudgetAtLimit" {
609+
details = fmt.Sprintf("Namespace=%s, PodDisruptionBudget=%s. %s", alert.Labels["namespace"], alert.Labels["poddisruptionbudget"], details)
610+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
611+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
612+
Status: metav1.ConditionTrue,
613+
Reason: internal.AlertConditionReason(string(alert.State)),
614+
Message: fmt.Sprintf("%s alert %s %s, which might slow node drains. %s", severity, alertName, alert.State, details),
615+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
616+
})
617+
continue
618+
}
619+
620+
if alertName == "KubeContainerWaiting" {
621+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
622+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
623+
Status: metav1.ConditionTrue,
624+
Reason: internal.AlertConditionReason(string(alert.State)),
625+
Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", severity, alertName, alert.State, details),
626+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
627+
})
628+
continue
629+
}
630+
631+
if alertName == "KubeNodeNotReady" || alertName == "KubeNodeReadinessFlapping" || alertName == "KubeNodeUnreachable" {
632+
risks[alertName] = getRisk(risks, alertName, summary, alertURL, alertPromQL, metav1.Condition{
633+
Type: internal.ConditionalUpdateRiskConditionTypeApplies,
634+
Status: metav1.ConditionTrue,
635+
Reason: internal.AlertConditionReason(string(alert.State)),
636+
Message: fmt.Sprintf("%s alert %s %s, which may slow workload redistribution during rolling node updates. %s", severity, alertName, alert.State, details),
637+
LastTransitionTime: metav1.NewTime(alert.ActiveAt),
638+
})
639+
continue
640+
}
641+
}
642+
643+
klog.V(2).Infof("Got %d risks", len(risks))
644+
if len(risks) == 0 {
645+
return nil
646+
}
647+
648+
var ret []configv1.ConditionalUpdateRisk
649+
var keys []string
650+
for k := range risks {
651+
keys = append(keys, k)
652+
}
653+
sort.Strings(keys)
654+
for _, k := range keys {
655+
ret = append(ret, risks[k])
656+
}
657+
return ret
658+
}
659+
660+
func getRisk(risks map[string]configv1.ConditionalUpdateRisk, riskName, message, url, promQL string, condition metav1.Condition) configv1.ConditionalUpdateRisk {
661+
risk, ok := risks[riskName]
662+
if !ok {
663+
return configv1.ConditionalUpdateRisk{
664+
Name: riskName,
665+
Message: message,
666+
URL: url,
667+
MatchingRules: []configv1.ClusterCondition{
668+
{
669+
Type: "PromQL",
670+
PromQL: &configv1.PromQLClusterCondition{
671+
PromQL: promQL,
672+
},
673+
},
674+
},
675+
Conditions: []metav1.Condition{condition},
676+
}
677+
}
678+
679+
if c := meta.FindStatusCondition(risk.Conditions, condition.Type); c != nil {
680+
c.Message = fmt.Sprintf("%s; %s", c.Message, condition.Message)
681+
if c.LastTransitionTime.After(condition.LastTransitionTime.Time) {
682+
c.LastTransitionTime = condition.LastTransitionTime
683+
}
684+
meta.SetStatusCondition(&risk.Conditions, *c)
685+
}
686+
687+
return risk
688+
}
689+
516690
func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
517691
if u == nil {
518692
return
@@ -522,7 +696,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
522696
risks := risksInOrder(riskVersions)
523697
u.RiskConditions = loadRiskConditions(ctx, risks, riskVersions, u.ConditionRegistry)
524698

525-
if err := sanityCheck(u.ConditionalUpdates); err != nil {
699+
if err := sanityCheck(u.ConditionalUpdates, u.AlertRisks); err != nil {
526700
klog.Errorf("Sanity check failed which might impact risk evaluation: %v", err)
527701
}
528702
for i, conditionalUpdate := range u.ConditionalUpdates {
@@ -540,7 +714,7 @@ func (u *availableUpdates) evaluateConditionalUpdates(ctx context.Context) {
540714
}
541715
}
542716

543-
func sanityCheck(updates []configv1.ConditionalUpdate) error {
717+
func sanityCheck(updates []configv1.ConditionalUpdate, alertRisks []configv1.ConditionalUpdateRisk) error {
544718
risks := map[string]configv1.ConditionalUpdateRisk{}
545719
var errs []error
546720
for _, update := range updates {
@@ -556,6 +730,11 @@ func sanityCheck(updates []configv1.ConditionalUpdate) error {
556730
}
557731
}
558732
}
733+
for _, alertRisk := range alertRisks {
734+
if _, ok := risks[alertRisk.Name]; ok {
735+
errs = append(errs, fmt.Errorf("found alert risk and conditional update risk share the name: %s", alertRisk.Name))
736+
}
737+
}
559738
return utilerrors.NewAggregate(errs)
560739
}
561740

@@ -600,6 +779,7 @@ const (
600779
recommendedReasonAllExposedRisksAccepted = "AllExposedRisksAccepted"
601780
recommendedReasonEvaluationFailed = "EvaluationFailed"
602781
recommendedReasonMultiple = "MultipleReasons"
782+
//recommendedReasonAlertFiring = "AlertFiring"
603783

604784
// recommendedReasonExposed is used instead of the original name if it does
605785
// not match the pattern for a valid k8s condition reason.
@@ -617,10 +797,13 @@ var reasonPattern = regexp.MustCompile(`^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$
617797
func newRecommendedReason(now, want string) string {
618798
switch {
619799
case now == recommendedReasonRisksNotExposed ||
620-
now == recommendedReasonAllExposedRisksAccepted ||
800+
now == recommendedReasonAllExposedRisksAccepted && want != recommendedReasonRisksNotExposed ||
801+
now == recommendedReasonEvaluationFailed && want == recommendedReasonExposed ||
621802
now == want:
622803
return want
623-
case want == recommendedReasonRisksNotExposed:
804+
case want == recommendedReasonRisksNotExposed ||
805+
(now == recommendedReasonEvaluationFailed) && want == recommendedReasonAllExposedRisksAccepted ||
806+
now == recommendedReasonExposed && (want == recommendedReasonAllExposedRisksAccepted || want == recommendedReasonEvaluationFailed):
624807
return now
625808
default:
626809
return recommendedReasonMultiple
@@ -677,7 +860,6 @@ func evaluateConditionalUpdate(
677860
if len(errorMessages) > 0 {
678861
recommended.Message = strings.Join(errorMessages, "\n\n")
679862
}
680-
681863
return recommended
682864
}
683865

0 commit comments

Comments
 (0)