Skip to content

Commit b5e2104

Browse files
committed
feat(dispatch): add group markers
This change adds group markers to the dispatcher. Group markers replace the global marker and are used to track the state of alerts in an aggregation group. This change touches many components of the alertmanager. Group markers are passed to the notifiers and then inhibitor and silencer using context. The API has no breaking changes: - /alerts uses a temporary marker to track the state of alerts - /alerts/groups returns the group markers The metrics are also updated to use group markers. The `alertmanager_alerts` metric is moved to dispatcher. The `alertmanager_marked_alerts` metric is removed. By default it behaves the same as before, by aggregating all alerts in the groups. Enabling `group-key-in-metrics` flag will cause the metrics to be grouped by `group_key`. Signed-off-by: Siavash Safi <siavash@cloudflare.com>
1 parent 710984e commit b5e2104

30 files changed

Lines changed: 975 additions & 908 deletions

api/api.go

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -59,9 +59,6 @@ type Options struct {
5959
Alerts provider.Alerts
6060
// Silences to be used by the API. Mandatory.
6161
Silences *silence.Silences
62-
// AlertStatusFunc is used be the API to retrieve the AlertStatus of an
63-
// alert. Mandatory.
64-
AlertStatusFunc func(model.Fingerprint) types.AlertStatus
6562
// GroupMutedFunc is used be the API to know if an alert is muted.
6663
// Mandatory.
6764
GroupMutedFunc func(routeID, groupKey string) ([]string, bool)
@@ -95,9 +92,6 @@ func (o Options) validate() error {
9592
if o.Silences == nil {
9693
return errors.New("mandatory field Silences not set")
9794
}
98-
if o.AlertStatusFunc == nil {
99-
return errors.New("mandatory field AlertStatusFunc not set")
100-
}
10195
if o.GroupMutedFunc == nil {
10296
return errors.New("mandatory field GroupMutedFunc not set")
10397
}
@@ -125,7 +119,6 @@ func New(opts Options) (*API, error) {
125119
v2, err := apiv2.NewAPI(
126120
opts.Alerts,
127121
opts.GroupFunc,
128-
opts.AlertStatusFunc,
129122
opts.GroupMutedFunc,
130123
opts.Silences,
131124
opts.Peer,

api/v2/api.go

Lines changed: 15 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@ import (
3636
"go.opentelemetry.io/otel"
3737
"go.opentelemetry.io/otel/codes"
3838

39+
"github.com/prometheus/alertmanager/alert"
3940
"github.com/prometheus/alertmanager/api/metrics"
4041
open_api_models "github.com/prometheus/alertmanager/api/v2/models"
4142
"github.com/prometheus/alertmanager/api/v2/restapi"
@@ -48,12 +49,12 @@ import (
4849
"github.com/prometheus/alertmanager/cluster"
4950
"github.com/prometheus/alertmanager/config"
5051
"github.com/prometheus/alertmanager/dispatch"
52+
"github.com/prometheus/alertmanager/marker"
5153
"github.com/prometheus/alertmanager/matcher/compat"
5254
"github.com/prometheus/alertmanager/pkg/labels"
5355
"github.com/prometheus/alertmanager/provider"
5456
"github.com/prometheus/alertmanager/silence"
5557
"github.com/prometheus/alertmanager/silence/silencepb"
56-
"github.com/prometheus/alertmanager/types"
5758
)
5859

5960
var tracer = otel.Tracer("github.com/prometheus/alertmanager/api/v2")
@@ -64,7 +65,6 @@ type API struct {
6465
silences *silence.Silences
6566
alerts provider.Alerts
6667
alertGroups groupsFn
67-
getAlertStatus getAlertStatusFn
6868
groupMutedFunc groupMutedFunc
6969
uptime time.Time
7070

@@ -83,17 +83,15 @@ type API struct {
8383
}
8484

8585
type (
86-
groupsFn func(context.Context, func(*dispatch.Route) bool, func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[prometheus_model.Fingerprint][]string, error)
86+
groupsFn func(context.Context, func(*dispatch.Route) bool, func(*alert.Alert, time.Time) bool) (dispatch.AlertGroups, map[prometheus_model.Fingerprint][]string, error)
8787
groupMutedFunc func(routeID, groupKey string) ([]string, bool)
88-
getAlertStatusFn func(prometheus_model.Fingerprint) types.AlertStatus
8988
setAlertStatusFn func(ctx context.Context, labels prometheus_model.LabelSet)
9089
)
9190

9291
// NewAPI returns a new Alertmanager API v2.
9392
func NewAPI(
9493
alerts provider.Alerts,
9594
gf groupsFn,
96-
asf getAlertStatusFn,
9795
gmf groupMutedFunc,
9896
silences *silence.Silences,
9997
peer cluster.ClusterPeer,
@@ -102,7 +100,6 @@ func NewAPI(
102100
) (*API, error) {
103101
api := API{
104102
alerts: alerts,
105-
getAlertStatus: asf,
106103
alertGroups: gf,
107104
groupMutedFunc: gmf,
108105
peer: peer,
@@ -281,7 +278,8 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
281278
alerts := api.alerts.GetPending()
282279
defer alerts.Close()
283280

284-
alertFilter := api.alertFilter(matchers, *params.Silenced, *params.Inhibited, *params.Active)
281+
tempMarker := marker.NewAlertMarker()
282+
alertFilter := api.alertFilter(matchers, *params.Silenced, *params.Inhibited, *params.Active, tempMarker)
285283
now := time.Now()
286284

287285
api.mtx.RLock()
@@ -308,7 +306,7 @@ func (api *API) getAlertsHandler(params alert_ops.GetAlertsParams) middleware.Re
308306
continue
309307
}
310308

311-
openAlert := AlertToOpenAPIAlert(alert, api.getAlertStatus(alert.Fingerprint()), receivers, nil)
309+
openAlert := AlertToOpenAPIAlert(alert, tempMarker.Status(alert.Fingerprint()), receivers, nil)
312310

313311
res = append(res, openAlert)
314312
}
@@ -365,7 +363,7 @@ func (api *API) postAlertsHandler(params alert_ops.PostAlertsParams) middleware.
365363

366364
// Make a best effort to insert all alerts that are valid.
367365
var (
368-
validAlerts = make([]*types.Alert, 0, len(alerts))
366+
validAlerts = make([]*alert.Alert, 0, len(alerts))
369367
validationErrs error
370368
)
371369
for _, a := range alerts {
@@ -432,7 +430,7 @@ func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams
432430
}
433431
}(receiverFilter)
434432

435-
af := api.alertFilter(matchers, *params.Silenced, *params.Inhibited, *params.Active)
433+
af := api.alertFilter(matchers, *params.Silenced, *params.Inhibited, *params.Active, marker.NewAlertMarker())
436434
alertGroups, allReceivers, err := api.alertGroups(ctx, rf, af)
437435
if err != nil {
438436
message := "Failed to get alert groups"
@@ -459,7 +457,7 @@ func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams
459457
for _, alert := range alertGroup.Alerts {
460458
fp := alert.Fingerprint()
461459
receivers := allReceivers[fp]
462-
status := api.getAlertStatus(fp)
460+
status := alertGroup.AlertStatuses[fp]
463461
apiAlert := AlertToOpenAPIAlert(alert, status, receivers, mutedBy)
464462
ag.Alerts = append(ag.Alerts, apiAlert)
465463
}
@@ -469,8 +467,8 @@ func (api *API) getAlertGroupsHandler(params alertgroup_ops.GetAlertGroupsParams
469467
return alertgroup_ops.NewGetAlertGroupsOK().WithPayload(res)
470468
}
471469

472-
func (api *API) alertFilter(matchers []*labels.Matcher, silenced, inhibited, active bool) func(a *types.Alert, now time.Time) bool {
473-
return func(a *types.Alert, now time.Time) bool {
470+
func (api *API) alertFilter(matchers []*labels.Matcher, silenced, inhibited, active bool, m marker.AlertMarker) func(a *alert.Alert, now time.Time) bool {
471+
return func(a *alert.Alert, now time.Time) bool {
474472
ctx, span := tracer.Start(context.Background(), "alertFilter")
475473
defer span.End()
476474

@@ -479,12 +477,14 @@ func (api *API) alertFilter(matchers []*labels.Matcher, silenced, inhibited, act
479477
}
480478

481479
// Set alert's current status based on its label set.
480+
// The inhibitor and silencer write to m via the context.
481+
ctx = marker.WithAlertMarker(ctx, m)
482482
api.setAlertStatus(ctx, a.Labels)
483483

484484
// Get alert's current status after seeing if it is suppressed.
485-
status := api.getAlertStatus(a.Fingerprint())
485+
status := m.Status(a.Fingerprint())
486486

487-
if !active && status.State == types.AlertStateActive {
487+
if !active && status.State == alert.AlertStateActive {
488488
return false
489489
}
490490

api/v2/api_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@ import (
3232
"github.com/stretchr/testify/require"
3333
"google.golang.org/protobuf/types/known/timestamppb"
3434

35+
"github.com/prometheus/alertmanager/alert"
3536
open_api_models "github.com/prometheus/alertmanager/api/v2/models"
3637
general_ops "github.com/prometheus/alertmanager/api/v2/restapi/operations/general"
3738
receiver_ops "github.com/prometheus/alertmanager/api/v2/restapi/operations/receiver"
@@ -40,7 +41,6 @@ import (
4041
"github.com/prometheus/alertmanager/pkg/labels"
4142
"github.com/prometheus/alertmanager/silence"
4243
"github.com/prometheus/alertmanager/silence/silencepb"
43-
"github.com/prometheus/alertmanager/types"
4444
)
4545

4646
// If api.peers == nil, Alertmanager cluster feature is disabled. Make sure to
@@ -494,15 +494,15 @@ func TestAlertToOpenAPIAlert(t *testing.T) {
494494
fp = "0223b772b51c29e1"
495495
receivers = []string{"receiver1", "receiver2"}
496496

497-
alert = &types.Alert{
497+
a = &alert.Alert{
498498
Alert: model.Alert{
499499
Labels: model.LabelSet{"severity": "critical", "alertname": "alert1"},
500500
StartsAt: start,
501501
},
502502
UpdatedAt: updated,
503503
}
504504
)
505-
openAPIAlert := AlertToOpenAPIAlert(alert, types.AlertStatus{State: types.AlertStateActive}, receivers, nil)
505+
openAPIAlert := AlertToOpenAPIAlert(a, alert.AlertStatus{State: alert.AlertStateActive}, receivers, nil)
506506
require.Equal(t, &open_api_models.GettableAlert{
507507
Annotations: open_api_models.LabelSet{},
508508
Alert: open_api_models.Alert{

api/v2/compat.go

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,10 @@ import (
2222
prometheus_model "github.com/prometheus/common/model"
2323
"google.golang.org/protobuf/types/known/timestamppb"
2424

25+
"github.com/prometheus/alertmanager/alert"
2526
open_api_models "github.com/prometheus/alertmanager/api/v2/models"
2627
"github.com/prometheus/alertmanager/silence"
2728
"github.com/prometheus/alertmanager/silence/silencepb"
28-
"github.com/prometheus/alertmanager/types"
2929
)
3030

3131
// GettableSilenceFromProto converts *silencepb.Silence to open_api_models.GettableSilence.
@@ -139,7 +139,7 @@ func PostableSilenceToProto(s *open_api_models.PostableSilence) (*silencepb.Sile
139139
}
140140

141141
// AlertToOpenAPIAlert converts internal alerts, alert types, and receivers to *open_api_models.GettableAlert.
142-
func AlertToOpenAPIAlert(alert *types.Alert, status types.AlertStatus, receivers, mutedBy []string) *open_api_models.GettableAlert {
142+
func AlertToOpenAPIAlert(alert *alert.Alert, status alert.AlertStatus, receivers, mutedBy []string) *open_api_models.GettableAlert {
143143
startsAt := strfmt.DateTime(alert.StartsAt)
144144
updatedAt := strfmt.DateTime(alert.UpdatedAt)
145145
endsAt := strfmt.DateTime(alert.EndsAt)
@@ -191,14 +191,14 @@ func AlertToOpenAPIAlert(alert *types.Alert, status types.AlertStatus, receivers
191191
return aa
192192
}
193193

194-
// OpenAPIAlertsToAlerts converts open_api_models.PostableAlerts to []*types.Alert.
195-
func OpenAPIAlertsToAlerts(ctx context.Context, apiAlerts open_api_models.PostableAlerts) []*types.Alert {
194+
// OpenAPIAlertsToAlerts converts open_api_models.PostableAlerts to []*alert.Alert.
195+
func OpenAPIAlertsToAlerts(ctx context.Context, apiAlerts open_api_models.PostableAlerts) []*alert.Alert {
196196
_, span := tracer.Start(ctx, "OpenAPIAlertsToAlerts")
197197
defer span.End()
198198

199-
alerts := []*types.Alert{}
199+
alerts := []*alert.Alert{}
200200
for _, apiAlert := range apiAlerts {
201-
alert := types.Alert{
201+
alert := alert.Alert{
202202
Alert: prometheus_model.Alert{
203203
Labels: APILabelSetToModelLabelSet(apiAlert.Labels),
204204
Annotations: APILabelSetToModelLabelSet(apiAlert.Annotations),

cmd/alertmanager/main.go

Lines changed: 7 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -46,13 +46,15 @@ import (
4646
webflag "github.com/prometheus/exporter-toolkit/web/kingpinflag"
4747
"go.uber.org/automaxprocs/maxprocs"
4848

49+
"github.com/prometheus/alertmanager/alert"
4950
"github.com/prometheus/alertmanager/api"
5051
"github.com/prometheus/alertmanager/cluster"
5152
"github.com/prometheus/alertmanager/config"
5253
"github.com/prometheus/alertmanager/config/receiver"
5354
"github.com/prometheus/alertmanager/dispatch"
5455
"github.com/prometheus/alertmanager/featurecontrol"
5556
"github.com/prometheus/alertmanager/inhibit"
57+
"github.com/prometheus/alertmanager/marker"
5658
"github.com/prometheus/alertmanager/matcher/compat"
5759
"github.com/prometheus/alertmanager/nflog"
5860
"github.com/prometheus/alertmanager/notify"
@@ -61,7 +63,6 @@ import (
6163
"github.com/prometheus/alertmanager/template"
6264
"github.com/prometheus/alertmanager/timeinterval"
6365
"github.com/prometheus/alertmanager/tracing"
64-
"github.com/prometheus/alertmanager/types"
6566
"github.com/prometheus/alertmanager/ui"
6667
)
6768

@@ -291,7 +292,7 @@ func run() int {
291292
notificationLog.Maintenance(*maintenanceInterval, filepath.Join(*dataDir, "nflog"), stopc, nil)
292293
})
293294

294-
marker := types.NewMarker(prometheus.DefaultRegisterer)
295+
marker := marker.NewGroupMarker()
295296

296297
silenceOpts := silence.Options{
297298
SnapshotFile: filepath.Join(*dataDir, "silences"),
@@ -324,7 +325,7 @@ func run() int {
324325
wg.Wait()
325326
}()
326327

327-
silencer := silence.NewSilencer(silences, marker, logger)
328+
silencer := silence.NewSilencer(silences, logger)
328329

329330
// Peer state listeners have been registered, now we can join and get the initial state.
330331
if peer != nil {
@@ -347,7 +348,6 @@ func run() int {
347348

348349
alerts, err := mem.NewAlerts(
349350
context.Background(),
350-
marker,
351351
*alertGCInterval,
352352
*perAlertNameLimit,
353353
silencer,
@@ -366,7 +366,7 @@ func run() int {
366366
disp.Load().Stop()
367367
}()
368368

369-
groupFn := func(ctx context.Context, routeFilter func(*dispatch.Route) bool, alertFilter func(*types.Alert, time.Time) bool) (dispatch.AlertGroups, map[model.Fingerprint][]string, error) {
369+
groupFn := func(ctx context.Context, routeFilter func(*dispatch.Route) bool, alertFilter func(*alert.Alert, time.Time) bool) (dispatch.AlertGroups, map[model.Fingerprint][]string, error) {
370370
return disp.Load().Groups(ctx, routeFilter, alertFilter)
371371
}
372372

@@ -381,7 +381,6 @@ func run() int {
381381
api, err := api.New(api.Options{
382382
Alerts: alerts,
383383
Silences: silences,
384-
AlertStatusFunc: marker.Status,
385384
GroupMutedFunc: marker.Muted,
386385
Peer: clusterPeer,
387386
Timeout: *httpTimeout,
@@ -421,7 +420,7 @@ func run() int {
421420
tmpl *template.Template
422421
)
423422

424-
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer)
423+
dispMetrics := dispatch.NewDispatcherMetrics(false, prometheus.DefaultRegisterer, ff)
425424
pipelineBuilder := notify.NewPipelineBuilder(prometheus.DefaultRegisterer, ff)
426425
configLogger := logger.With("component", "configuration")
427426
configCoordinator := config.NewCoordinator(
@@ -476,7 +475,7 @@ func run() int {
476475
inhibitor.Load().Stop()
477476
disp.Load().Stop()
478477

479-
newInhibitor := inhibit.NewInhibitor(alerts, conf.InhibitRules, marker, logger)
478+
newInhibitor := inhibit.NewInhibitor(alerts, conf.InhibitRules, logger)
480479
inhibitor.Store(newInhibitor)
481480

482481
// An interface value that holds a nil concrete value is non-nil.

0 commit comments

Comments
 (0)