Skip to content

feat: add hard-limited presets metric #18008

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 11 commits into from
May 26, 2025
Merged
46 changes: 46 additions & 0 deletions enterprise/coderd/prebuilds/metricscollector.go
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ const (
MetricDesiredGauge = namespace + "desired"
MetricRunningGauge = namespace + "running"
MetricEligibleGauge = namespace + "eligible"
MetricPresetHardLimitedGauge = namespace + "preset_hard_limited"
MetricLastUpdatedGauge = namespace + "metrics_last_updated"
)

Expand Down Expand Up @@ -82,6 +83,12 @@ var (
labels,
nil,
)
// presetHardLimitedDesc describes the gauge that flags presets which have
// reached the hard failure limit. Collect emits one sample with value 1 per
// preset currently tracked as hard-limited; presets that are not tracked
// produce no sample at all. Collect supplies the label values in the order
// template name, preset name, organization name.
presetHardLimitedDesc = prometheus.NewDesc(
MetricPresetHardLimitedGauge,
"Indicates whether a given preset has reached the hard failure limit (1 = hard-limited). Metric is omitted otherwise.",
labels,
nil,
)
lastUpdateDesc = prometheus.NewDesc(
MetricLastUpdatedGauge,
"The unix timestamp when the metrics related to prebuilt workspaces were last updated; these metrics are cached.",
Expand All @@ -104,17 +111,22 @@ type MetricsCollector struct {

replacementsCounter map[replacementKey]float64
replacementsCounterMu sync.Mutex

isPresetHardLimited map[hardLimitedPresetKey]bool
isPresetHardLimitedMu sync.Mutex
}

var _ prometheus.Collector = new(MetricsCollector)

// NewMetricsCollector builds a MetricsCollector that reads from db and
// snapshotter. The supplied logger is scoped under the name
// "prebuilds_metrics_collector", and both in-memory tracking maps
// (resource replacements and hard-limited presets) start out empty but
// allocated, so they can be written to immediately.
func NewMetricsCollector(db database.Store, logger slog.Logger, snapshotter prebuilds.StateSnapshotter) *MetricsCollector {
	collector := &MetricsCollector{
		database:    db,
		logger:      logger.Named("prebuilds_metrics_collector"),
		snapshotter: snapshotter,
	}

	// Maps must be non-nil before the first write.
	collector.replacementsCounter = make(map[replacementKey]float64)
	collector.isPresetHardLimited = make(map[hardLimitedPresetKey]bool)

	return collector
}

Expand All @@ -126,6 +138,7 @@ func (*MetricsCollector) Describe(descCh chan<- *prometheus.Desc) {
descCh <- desiredPrebuildsDesc
descCh <- runningPrebuildsDesc
descCh <- eligiblePrebuildsDesc
descCh <- presetHardLimitedDesc
descCh <- lastUpdateDesc
}

Expand Down Expand Up @@ -173,6 +186,17 @@ func (mc *MetricsCollector) Collect(metricsCh chan<- prometheus.Metric) {
metricsCh <- prometheus.MustNewConstMetric(eligiblePrebuildsDesc, prometheus.GaugeValue, float64(state.Eligible), preset.TemplateName, preset.Name, preset.OrganizationName)
}

mc.isPresetHardLimitedMu.Lock()
for key, isHardLimited := range mc.isPresetHardLimited {
var val float64
if isHardLimited {
val = 1
}

metricsCh <- prometheus.MustNewConstMetric(presetHardLimitedDesc, prometheus.GaugeValue, val, key.templateName, key.presetName, key.orgName)
}
mc.isPresetHardLimitedMu.Unlock()

metricsCh <- prometheus.MustNewConstMetric(lastUpdateDesc, prometheus.GaugeValue, float64(currentState.createdAt.Unix()))
}

Expand Down Expand Up @@ -247,3 +271,25 @@ func (mc *MetricsCollector) trackResourceReplacement(orgName, templateName, pres
// cause an issue (or indeed if either would), so we just track the replacement.
mc.replacementsCounter[key]++
}

// hardLimitedPresetKey identifies a preset by the names of its organization,
// template and the preset itself. It is comparable, so it can be used
// directly as a map key.
type hardLimitedPresetKey struct {
	// orgName is the name of the organization the preset belongs to.
	orgName string
	// templateName is the name of the template the preset belongs to.
	templateName string
	// presetName is the name of the preset.
	presetName string
}

// String renders the key as "<org>:<template>:<preset>".
func (k hardLimitedPresetKey) String() string {
	return fmt.Sprintf(
		"%s:%s:%s",
		k.orgName,
		k.templateName,
		k.presetName,
	)
}

// nolint:revive // isHardLimited determines if the preset should be reported as hard-limited in Prometheus.
//
// trackHardLimitedStatus records whether the preset identified by orgName,
// templateName and presetName is currently hard-limited. A hard-limited
// preset is stored in the tracking map; any other preset is removed from it,
// so only hard-limited presets are ever present. Access to the map is
// serialized by isPresetHardLimitedMu.
func (mc *MetricsCollector) trackHardLimitedStatus(orgName, templateName, presetName string, isHardLimited bool) {
	presetKey := hardLimitedPresetKey{
		orgName:      orgName,
		templateName: templateName,
		presetName:   presetName,
	}

	mc.isPresetHardLimitedMu.Lock()
	defer mc.isPresetHardLimitedMu.Unlock()

	// Drop the entry rather than storing false: absent means "not hard-limited".
	if !isHardLimited {
		delete(mc.isPresetHardLimited, presetKey)
		return
	}

	mc.isPresetHardLimited[presetKey] = true
}
29 changes: 19 additions & 10 deletions enterprise/coderd/prebuilds/reconcile.go
Original file line number Diff line number Diff line change
Expand Up @@ -361,17 +361,22 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
slog.F("preset_name", ps.Preset.Name),
)

// If the preset was previously hard-limited, log it and exit early.
if ps.Preset.PrebuildStatus == database.PrebuildStatusHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
return nil
}
// Report a preset as hard-limited only if all the following conditions are met:
// - The preset is marked as hard-limited
// - The preset is using the active version of its template, and the template has not been deleted
//
// The second condition is important because a hard-limited preset that has become outdated is no longer relevant.
// Its associated prebuilt workspaces were likely deleted, and it's not meaningful to continue reporting it
// as hard-limited to the admin.
reportAsHardLimited := ps.IsHardLimited && ps.Preset.UsingActiveVersion && !ps.Preset.Deleted
c.metrics.trackHardLimitedStatus(ps.Preset.OrganizationName, ps.Preset.TemplateName, ps.Preset.Name, reportAsHardLimited)

// If the preset reached the hard failure limit for the first time during this iteration:
// - Mark it as hard-limited in the database
// - Send notifications to template admins
if ps.IsHardLimited {
logger.Warn(ctx, "skipping hard limited preset")
// - Continue execution, we disallow only creation operation for hard-limited presets. Deletion is allowed.
if ps.Preset.PrebuildStatus != database.PrebuildStatusHardLimited && ps.IsHardLimited {
logger.Warn(ctx, "preset is hard limited, notifying template admins")

err := c.store.UpdatePresetPrebuildStatus(ctx, database.UpdatePresetPrebuildStatusParams{
Status: database.PrebuildStatusHardLimited,
Expand All @@ -384,10 +389,7 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
err = c.notifyPrebuildFailureLimitReached(ctx, ps)
if err != nil {
logger.Error(ctx, "failed to notify that number of prebuild failures reached the limit", slog.Error(err))
return nil
}

return nil
}

state := ps.CalculateState()
Expand Down Expand Up @@ -452,6 +454,13 @@ func (c *StoreReconciler) ReconcilePreset(ctx context.Context, ps prebuilds.Pres
actions.Create = desired
}

// If the preset is hard-limited and the planned actions include creating prebuilds, log it and exit early.
// Creating prebuilds is disallowed for hard-limited presets.
if ps.IsHardLimited && actions.Create > 0 {
logger.Warn(ctx, "skipping hard limited preset for create operation")
return nil
}

var multiErr multierror.Error

for range actions.Create {
Expand Down
Loading
Loading