Skip to main content
Welcome. This site supports keyboard navigation and screen readers. Press ? at any time for keyboard shortcuts. Press [ to focus the sidebar, ] to focus the content. High-contrast themes are available via the toolbar.
serard@dev00:~/cv

Part 44: The Observability Stack — Prometheus, Grafana, Loki

"You cannot operate what you cannot observe. You cannot observe what you cannot generate from one source of truth."


Why

DevLab has eight services that emit metrics. Four of them (GitLab, MinIO, Postgres, Traefik) emit Prometheus-format metrics directly. The rest expose operational signals via logs (which Loki ingests) or via health-check endpoints (which Prometheus scrapes via blackbox_exporter). On top of that, every HomeLab pipeline run publishes events into IHomeLabEventBus (Part 09), and we want those events surfaced in Grafana too.

Putting all of this together by hand is the project that consumes a homelab's lifetime. Every time you add a service, you have to:

  1. Update the Prometheus scrape config
  2. Update the Alertmanager rules
  3. Add a Grafana dashboard
  4. Configure Loki to ingest the new logs
  5. Wire the alerts to a notification channel (Slack? PagerDuty? email?)

The thesis of this part is: the observability stack is its own compose stack on the obs VM. Prometheus scrape configs, alert rules, Grafana dashboards, and Loki sources are all generated from Ops.Observability declarations attached to other compose contributors. Adding a metric to a service is one declaration on the service's contributor; the rest is automatic.


The shape

The observability stack contributors:

/// <summary>
/// Adds the <c>prometheus</c> service, its data volume and the <c>obs-net</c>
/// network to the compose file of the <c>obs</c> VM.
/// </summary>
/// <remarks>
/// NOTE(review): <c>_config</c> is read below but is not declared in this excerpt;
/// presumably it is an injected configuration object — confirm against the full class.
/// </remarks>
[Injectable(ServiceLifetime.Singleton)]
public sealed class PrometheusComposeContributor : IComposeFileContributor
{
    private const string DataVolume = "prometheus_data";
    private const string NetworkName = "obs-net";

    /// <summary>Name of the VM whose compose file receives this contribution.</summary>
    public string TargetVm => "obs";

    /// <summary>
    /// Registers the Prometheus service and makes sure the volume and network it
    /// references exist in <paramref name="compose"/>.
    /// </summary>
    /// <param name="compose">Compose file model to mutate in place.</param>
    public void Contribute(ComposeFile compose)
    {
        compose.Services["prometheus"] = BuildService();

        // `??=` only assigns when the current entry is null, so an existing
        // declaration of the same volume/network is left untouched.
        compose.Volumes[DataVolume] ??= new ComposeVolume { Driver = "local" };
        compose.Networks[NetworkName] ??= new ComposeNetwork { Driver = "bridge" };
    }

    /// <summary>Builds the compose service definition for the Prometheus container.</summary>
    private ComposeService BuildService()
    {
        // Joined with single spaces into one command-line string.
        var commandLine = string.Join(" ", new[]
        {
            "--config.file=/etc/prometheus/prometheus.yml",
            "--storage.tsdb.path=/prometheus",
            "--storage.tsdb.retention.time=30d",
            "--web.enable-lifecycle"
        });

        // Traefik router: HTTPS entry point, guarded by basic auth and an IP allow-list.
        var traefikLabels = new TraefikLabels()
            .Enable()
            .Router("prometheus", router => router
                .Rule($"Host(`prometheus.{_config.Acme.Tld}`)")
                .EntryPoints("websecure")
                .Middlewares("basic-auth-prometheus", "ip-allowlist-internal")
                .Tls(certResolver: "default"))
            .Build();

        return new ComposeService
        {
            Image = "prom/prometheus:v2.55.0",
            Restart = "always",
            Hostname = "prometheus",
            Volumes = new()
            {
                // Main config and alert rules are mounted read-only.
                "./prometheus/prometheus.yml:/etc/prometheus/prometheus.yml:ro",
                "./prometheus/alerts:/etc/prometheus/alerts:ro",
                $"{DataVolume}:/prometheus"
            },
            Networks = new() { NetworkName },
            Command = commandLine,
            HealthCheck = new ComposeHealthcheck { /* ... */ },
            Labels = traefikLabels
        };
    }
}

/// <summary>
/// Adds the <c>grafana</c> service, its data volume, its admin-password secret and
/// the <c>obs-net</c> network to the compose file of the <c>obs</c> VM.
/// </summary>
/// <remarks>
/// NOTE(review): <c>_config</c> is read below but is not declared in this excerpt;
/// presumably it is an injected configuration object — confirm against the full class.
/// </remarks>
[Injectable(ServiceLifetime.Singleton)]
public sealed class GrafanaComposeContributor : IComposeFileContributor
{
    /// <summary>Name of the VM whose compose file receives this contribution.</summary>
    public string TargetVm => "obs";

    /// <summary>
    /// Registers the Grafana service and every top-level resource it references
    /// (volume, secret, network) in <paramref name="compose"/>.
    /// </summary>
    /// <param name="compose">Compose file model to mutate in place.</param>
    public void Contribute(ComposeFile compose)
    {
        compose.Services["grafana"] = new ComposeService
        {
            Image = "grafana/grafana:11.4.0",
            Restart = "always",
            Hostname = "grafana",
            Environment = new()
            {
                // Points at the file of the compose secret declared below.
                ["GF_SECURITY_ADMIN_PASSWORD__FILE"] = "/run/secrets/grafana_admin_password",
                ["GF_AUTH_ANONYMOUS_ENABLED"]        = "false",
                // NOTE(review): grafana-piechart-panel looks like the legacy pie chart plugin;
                // confirm it still loads on Grafana 11 or drop it in favour of the built-in panel.
                ["GF_INSTALL_PLUGINS"]               = "grafana-piechart-panel"
            },
            Volumes = new()
            {
                "grafana_data:/var/lib/grafana",
                "./grafana/provisioning:/etc/grafana/provisioning:ro",
                "./grafana/dashboards:/var/lib/grafana/dashboards:ro"
            },
            Networks = new() { "obs-net" },
            Secrets = new() { "grafana_admin_password" },
            Labels = new TraefikLabels()
                .Enable()
                .Router("grafana", r => r
                    .Rule($"Host(`grafana.{_config.Acme.Tld}`)")
                    .EntryPoints("websecure")
                    .Middlewares("basic-auth-grafana")
                    .Tls(certResolver: "default"))
                .Build()
        };

        compose.Volumes["grafana_data"] ??= new ComposeVolume { Driver = "local" };
        compose.Secrets["grafana_admin_password"] ??= new ComposeSecret { File = "./secrets/grafana_admin_password" };

        // The service attaches to obs-net, so declare it here as well instead of relying
        // on another contributor having run first; `??=` keeps an existing declaration.
        compose.Networks["obs-net"] ??= new ComposeNetwork { Driver = "bridge" };
    }
}

/// <summary>
/// Adds the log pipeline to the compose file of the <c>obs</c> VM: the <c>loki</c>
/// server with its data volume, and the <c>promtail</c> shipper that starts after it.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class LokiComposeContributor : IComposeFileContributor
{
    /// <summary>Name of the VM whose compose file receives this contribution.</summary>
    public string TargetVm => "obs";

    /// <summary>
    /// Registers the Loki and Promtail services plus the volume and network they
    /// reference in <paramref name="compose"/>.
    /// </summary>
    /// <param name="compose">Compose file model to mutate in place.</param>
    public void Contribute(ComposeFile compose)
    {
        compose.Services["loki"] = new ComposeService
        {
            Image = "grafana/loki:3.2.0",
            Restart = "always",
            Hostname = "loki",
            Volumes = new()
            {
                "./loki/loki-config.yaml:/etc/loki/loki-config.yaml:ro",
                "loki_data:/loki"
            },
            Networks = new() { "obs-net" },
            Command = "-config.file=/etc/loki/loki-config.yaml"
        };
        compose.Volumes["loki_data"] ??= new ComposeVolume { Driver = "local" };

        // Both services attach to obs-net, so declare it here as well instead of relying
        // on another contributor having run first; `??=` keeps an existing declaration.
        compose.Networks["obs-net"] ??= new ComposeNetwork { Driver = "bridge" };

        compose.Services["promtail"] = new ComposeService
        {
            Image = "grafana/promtail:3.2.0",
            Restart = "always",
            Volumes = new()
            {
                "./promtail/config.yaml:/etc/promtail/config.yaml:ro",
                // Host log directories, mounted read-only.
                "/var/log:/var/log:ro",
                "/var/lib/docker/containers:/var/lib/docker/containers:ro"
            },
            Networks = new() { "obs-net" },
            // The config is mounted as config.yaml, while the image's default command looks
            // for config.yml; point Promtail at the mounted file explicitly so the generated
            // configuration is the one actually loaded.
            Command = "-config.file=/etc/promtail/config.yaml",
            DependsOn = new() { ["loki"] = new() { Condition = "service_started" } }
        };
    }
}

/// <summary>
/// Adds the <c>alertmanager</c> service and the <c>obs-net</c> network to the
/// compose file of the <c>obs</c> VM.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class AlertmanagerComposeContributor : IComposeFileContributor
{
    /// <summary>Name of the VM whose compose file receives this contribution.</summary>
    public string TargetVm => "obs";

    /// <summary>
    /// Registers the Alertmanager service and the network it attaches to in
    /// <paramref name="compose"/>.
    /// </summary>
    /// <param name="compose">Compose file model to mutate in place.</param>
    public void Contribute(ComposeFile compose)
    {
        compose.Services["alertmanager"] = new ComposeService
        {
            Image = "prom/alertmanager:v0.27.0",
            Restart = "always",
            // Configuration file is mounted read-only.
            Volumes = new() { "./alertmanager/alertmanager.yml:/etc/alertmanager/alertmanager.yml:ro" },
            Networks = new() { "obs-net" },
            Command = "--config.file=/etc/alertmanager/alertmanager.yml"
        };

        // The service attaches to obs-net, so declare it here as well instead of relying
        // on another contributor having run first; `??=` keeps an existing declaration.
        compose.Networks["obs-net"] ??= new ComposeNetwork { Driver = "bridge" };
    }
}

Five services, four contributors, all on the obs VM. The configuration files (prometheus.yml, alerts/*.yml, grafana/dashboards/*.json, loki-config.yaml, alertmanager.yml) are all generated from typed C#.


The Ops.Observability declarations

Other contributors (the ones for GitLab, Postgres, MinIO, etc.) declare their observability concerns on themselves, by exposing a collection of Ops.Observability declarations:

/// <summary>
/// Contributes the <c>gitlab</c> service to the compose file of the <c>platform</c> VM
/// and declares, as data, the health check, scrape target, alert rules, dashboard and
/// log source that belong to it.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class GitLabComposeContributor : IComposeFileContributor
{
    /// <summary>Name of the VM whose compose file receives this contribution.</summary>
    public string TargetVm => "platform";

    /// <summary>Registers the GitLab service in <paramref name="compose"/>.</summary>
    /// <param name="compose">Compose file model to mutate in place.</param>
    public void Contribute(ComposeFile compose)
    {
        compose.Services["gitlab"] = new ComposeService { /* as before */ };
    }

    /// <summary>
    /// Observability declarations for GitLab. A new array is built on every access.
    /// </summary>
    public IEnumerable<OpsObservabilityDeclaration> Observability => new OpsObservabilityDeclaration[]
    {
        new OpsHealthCheck("gitlab", Path: "/-/health", IntervalSeconds: 30, TimeoutSeconds: 5),

        new OpsMetricScrapeTarget("gitlab", Endpoint: "http://gitlab:80/-/metrics", IntervalSeconds: 60),

        new OpsAlertRule(
            Name: "GitLabDown",
            Expression: "up{job=\"gitlab\"} == 0",
            For: "2m",
            Severity: "critical",
            Annotations: new() { ["summary"] = "GitLab is down" }),

        // The summary promises a percentage, so compare the share of 5xx responses
        // (5xx rate divided by total rate) with 0.05 rather than the absolute number
        // of 5xx responses per second.
        new OpsAlertRule(
            Name: "GitLabHighErrorRate",
            Expression: "sum(rate(http_requests_total{job=\"gitlab\",code=~\"5..\"}[5m])) / " +
                        "sum(rate(http_requests_total{job=\"gitlab\"}[5m])) > 0.05",
            For: "5m",
            Severity: "warning",
            Annotations: new() { ["summary"] = "GitLab error rate above 5%" }),

        new OpsDashboard("gitlab",
            Title: "GitLab Service",
            Panels: new[]
            {
                new GrafanaPanel("Request rate",   "rate(http_requests_total{job=\"gitlab\"}[5m])"),
                new GrafanaPanel("Error rate",     "rate(http_requests_total{job=\"gitlab\",code=~\"5..\"}[5m])"),
                new GrafanaPanel("p95 latency",    "histogram_quantile(0.95, rate(http_request_duration_seconds_bucket{job=\"gitlab\"}[5m]))")
            }),

        new OpsLogSource("gitlab", DockerLabelMatch: "compose.service=gitlab")
    };
}

The contributor declares its observability needs as data. The Prometheus / Grafana / Alertmanager / Loki configuration files are then generated by walking every contributor's Observability collection and emitting the right shape.


The generators

/// <summary>
/// Renders <c>prometheus.yml</c> from the <see cref="OpsMetricScrapeTarget"/>
/// declarations exposed by the compose contributors.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class PrometheusConfigGenerator
{
    /// <summary>
    /// Builds the Prometheus configuration: fixed global/alerting/rule-file sections
    /// followed by one scrape job per declared scrape target.
    /// </summary>
    /// <param name="contributors">
    /// Contributors to inspect; a contributor whose <c>Observability</c> is null is
    /// treated as having no declarations.
    /// </param>
    /// <returns>The YAML text of the configuration file.</returns>
    public string Generate(IEnumerable<IComposeFileContributor> contributors)
    {
        var scrapeTargets = contributors
            .SelectMany(c => c.Observability ?? Array.Empty<OpsObservabilityDeclaration>())
            .OfType<OpsMetricScrapeTarget>()
            .ToList();

        var sb = new StringBuilder();
        sb.AppendLine("global:");
        sb.AppendLine("  scrape_interval: 30s");
        sb.AppendLine("  evaluation_interval: 30s");
        sb.AppendLine();
        sb.AppendLine("alerting:");
        sb.AppendLine("  alertmanagers:");
        sb.AppendLine("    - static_configs:");
        sb.AppendLine("        - targets: ['alertmanager:9093']");
        sb.AppendLine();
        sb.AppendLine("rule_files:");
        sb.AppendLine("  - /etc/prometheus/alerts/*.yml");
        sb.AppendLine();
        sb.AppendLine("scrape_configs:");

        foreach (var target in scrapeTargets)
        {
            sb.AppendLine($"  - job_name: '{target.JobName}'");
            sb.AppendLine($"    scrape_interval: {target.IntervalSeconds}s");
            AppendEndpoint(sb, target.Endpoint);
        }

        return sb.ToString();
    }

    /// <summary>
    /// Emits the scheme, metrics path and static target for one endpoint.
    /// </summary>
    /// <remarks>
    /// A static target must be <c>host:port</c>; the URL scheme and path belong in
    /// <c>scheme</c> and <c>metrics_path</c>. Endpoints that are not absolute
    /// http/https URLs (for example a bare <c>host:port</c>) are written unchanged.
    /// Any query string on the endpoint is not emitted.
    /// </remarks>
    private static void AppendEndpoint(StringBuilder sb, string endpoint)
    {
        var isHttpUrl = Uri.TryCreate(endpoint, UriKind.Absolute, out var uri)
                        && (uri.Scheme == Uri.UriSchemeHttp || uri.Scheme == Uri.UriSchemeHttps);

        if (!isHttpUrl)
        {
            sb.AppendLine("    static_configs:");
            sb.AppendLine($"      - targets: ['{endpoint}']");
            return;
        }

        // http is what Prometheus assumes when no scheme is given, so only https is spelled out.
        if (uri.Scheme == Uri.UriSchemeHttps)
        {
            sb.AppendLine("    scheme: https");
        }

        // A bare "/" means the endpoint named no path; leave Prometheus' default in place.
        if (uri.AbsolutePath.Length > 1)
        {
            sb.AppendLine($"    metrics_path: '{uri.AbsolutePath}'");
        }

        sb.AppendLine("    static_configs:");
        sb.AppendLine($"      - targets: ['{uri.Host}:{uri.Port}']");
    }
}

/// <summary>
/// Renders a Prometheus rule file from the <see cref="OpsAlertRule"/> declarations
/// exposed by the compose contributors, one rule group per severity.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class PrometheusAlertRuleGenerator
{
    /// <summary>
    /// Builds the rule file. Rules are grouped by <c>Severity</c>; the severity is used
    /// both as the group name and as the <c>severity</c> label of each rule.
    /// </summary>
    /// <param name="contributors">
    /// Contributors to inspect; a contributor whose <c>Observability</c> is null is
    /// treated as having no declarations.
    /// </param>
    /// <returns>The YAML text of the rule file.</returns>
    public string Generate(IEnumerable<IComposeFileContributor> contributors)
    {
        var rules = contributors
            .SelectMany(c => c.Observability ?? Array.Empty<OpsObservabilityDeclaration>())
            .OfType<OpsAlertRule>()
            .GroupBy(r => r.Severity);

        var sb = new StringBuilder();
        sb.AppendLine("groups:");
        foreach (var group in rules)
        {
            sb.AppendLine($"  - name: {group.Key}");
            sb.AppendLine($"    rules:");
            foreach (var rule in group)
            {
                sb.AppendLine($"      - alert: {rule.Name}");
                // Quoted so that an expression starting with '{' or containing ": " or " #"
                // is still read as a single YAML string.
                sb.AppendLine($"        expr: {SingleQuoted(rule.Expression)}");
                sb.AppendLine($"        for: {rule.For}");
                sb.AppendLine($"        labels:");
                sb.AppendLine($"          severity: {rule.Severity}");
                sb.AppendLine($"        annotations:");
                foreach (var (k, v) in rule.Annotations)
                    sb.AppendLine($"          {k}: {DoubleQuoted(v)}");
            }
        }
        return sb.ToString();
    }

    /// <summary>Wraps a value in YAML single quotes, doubling embedded single quotes.</summary>
    private static string SingleQuoted(string value) =>
        "'" + (value ?? string.Empty).Replace("'", "''") + "'";

    /// <summary>
    /// Wraps a value in YAML double quotes, escaping backslashes, double quotes and
    /// line breaks so the value cannot terminate the scalar early.
    /// </summary>
    private static string DoubleQuoted(string value) =>
        "\"" + (value ?? string.Empty)
            .Replace("\\", "\\\\")
            .Replace("\"", "\\\"")
            .Replace("\r", "\\r")
            .Replace("\n", "\\n") + "\"";
}

/// <summary>
/// Renders one Grafana dashboard JSON document per <see cref="OpsDashboard"/>
/// declaration exposed by the compose contributors.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class GrafanaDashboardGenerator
{
    /// <summary>
    /// Produces the dashboard files keyed by file name (<c>{Slug}.json</c>).
    /// </summary>
    /// <param name="contributors">
    /// Contributors to inspect; a contributor whose <c>Observability</c> is null is
    /// treated as having no declarations.
    /// </param>
    /// <returns>
    /// File name to JSON content. When two dashboards share a slug, the one
    /// encountered last replaces the earlier entry.
    /// </returns>
    public Dictionary<string, string> Generate(IEnumerable<IComposeFileContributor> contributors)
    {
        var filesByName = new Dictionary<string, string>();

        foreach (var contributor in contributors)
        {
            var declarations = contributor.Observability ?? Array.Empty<OpsObservabilityDeclaration>();

            foreach (var dashboard in declarations.OfType<OpsDashboard>())
            {
                var content = GrafanaDashboardJsonEmitter.Emit(dashboard);   // ~80 lines that build the JSON
                filesByName[$"{dashboard.Slug}.json"] = content;
            }
        }

        return filesByName;
    }
}

Three generators, all walking the same IEnumerable<IComposeFileContributor> and producing different artifacts. The Generate stage of the pipeline writes them to the right paths under out/obs/.


Wiring HomeLab events into observability

IHomeLabEventBus events are also a source of metrics. We bridge them via a small subscriber:

/// <summary>
/// Translates pipeline stage events from the HomeLab event bus into metrics and
/// pushes them through <see cref="IPrometheusPushGateway"/> after every event.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class EventBusObservabilityBridge : IHomeLabEventSubscriber
{
    private readonly IPrometheusPushGateway _pushgw;

    // All three metrics carry a single "stage" label.
    private readonly Counter _stageCompleted;
    private readonly Counter _stageFailed;
    private readonly Histogram _stageDuration;

    /// <summary>Creates the bridge and the three stage metrics it maintains.</summary>
    /// <param name="pushgw">Gateway used to push the metrics after each event.</param>
    public EventBusObservabilityBridge(IPrometheusPushGateway pushgw)
    {
        _pushgw = pushgw;

        _stageCompleted = Metrics.CreateCounter("homelab_stage_completed_total", "...", "stage");
        _stageFailed    = Metrics.CreateCounter("homelab_stage_failed_total", "...", "stage");
        _stageDuration  = Metrics.CreateHistogram("homelab_stage_duration_seconds", "...", "stage");
    }

    /// <summary>
    /// Registers handlers for <c>StageCompleted</c> and <c>StageFailed</c> on
    /// <paramref name="bus"/>.
    /// </summary>
    /// <param name="bus">Event bus to subscribe to.</param>
    public void Subscribe(IHomeLabEventBus bus)
    {
        // Completed stage: count it, record its duration in seconds, then push.
        bus.Subscribe<StageCompleted>((completed, cancellationToken) =>
        {
            var stage = completed.StageName;
            _stageCompleted.WithLabels(stage).Inc();
            _stageDuration.WithLabels(stage).Observe(completed.Duration.TotalSeconds);
            return _pushgw.PushAsync(cancellationToken);
        });

        // Failed stage: count it, then push.
        bus.Subscribe<StageFailed>((failed, cancellationToken) =>
        {
            _stageFailed.WithLabels(failed.StageName).Inc();
            return _pushgw.PushAsync(cancellationToken);
        });
    }
}

The bridge pushes these counters and the duration histogram to the Prometheus Pushgateway (a tiny service that lets short-lived processes publish metrics for Prometheus to scrape). The pushgateway is itself a compose service in the obs stack. Now every HomeLab pipeline run shows up in Grafana — duration, success rate, per-stage timing. The dogfood loop extends to observability: HomeLab observes its own provisioning runs.


The test

// The generated prometheus.yml must contain a scrape job for each of the
// gitlab, postgres and minio targets.
[Fact]
public void prometheus_config_generator_emits_one_scrape_per_metric_target()
{
    // Arrange
    var generator = new PrometheusConfigGenerator();

    // Act
    var yaml = generator.Generate(StandardContributors());

    // Assert
    foreach (var job in new[] { "gitlab", "postgres", "minio" })
    {
        yaml.Should().Contain($"job_name: '{job}'");
    }
}

// The generated rule file must contain a group per severity and the GitLabDown alert.
[Fact]
public void alert_rule_generator_groups_by_severity()
{
    // Arrange
    var generator = new PrometheusAlertRuleGenerator();

    // Act
    var yaml = generator.Generate(StandardContributors());

    // Assert
    foreach (var expected in new[] { "- name: critical", "- name: warning", "alert: GitLabDown" })
    {
        yaml.Should().Contain(expected);
    }
}

// The GitLab dashboard must be emitted as gitlab.json and carry a "title" property.
[Fact]
public void dashboard_generator_emits_one_json_per_dashboard()
{
    // Arrange
    const string fileName = "gitlab.json";
    var generator = new GrafanaDashboardGenerator();

    // Act
    var files = generator.Generate(StandardContributors());

    // Assert
    files.Should().ContainKey(fileName);
    files[fileName].Should().Contain("\"title\":");
}

What this gives you that bash doesn't

A bash script that "sets up Prometheus + Grafana" is a 200-line prometheus.yml template, a 50-line alertmanager.yml, three Grafana dashboards copied from the internet, and a provisioning/datasources/loki.yaml you forget to update when the Loki version changes.

A typed Ops.Observability declaration set with generators gives you, for the same surface area:

  • One declaration per concern (HealthCheck, MetricScrapeTarget, AlertRule, Dashboard, LogSource) attached to the contributor that owns the service
  • Generated prometheus.yml that always knows about every scrape target
  • Generated alert rules grouped by severity
  • Generated Grafana dashboards from typed panels
  • An event-bus bridge that surfaces HomeLab pipeline runs in Grafana
  • Tests that assert the generators see every contributor

The bargain pays back the first time you add a new service and the dashboard for it appears automatically on the next homelab compose deploy.


⬇ Download