Part 32: Velero for Backup
"Backups that have not been restored are hopeful tarballs. Restored once a week, they are insurance."
Why
Velero is the standard cluster-aware backup tool for Kubernetes. It backs up:
- API resources (Deployments, Services, ConfigMaps, Secrets, CRDs) by querying the API server
- PVC contents via either CSI snapshots (if the CSI driver supports them) or restic (which copies file-by-file)
- Whole namespaces or label-selected subsets
The thesis: K8s.Dsl ships VeleroHelmReleaseContributor configured to back up to the in-cluster MinIO. Each cluster gets a daily backup of each critical namespace. A scheduled restore-test job restores the latest backup into a throwaway namespace and verifies the workloads come up healthy. Dogfood loop closure.
The shape
/// <summary>
/// Contributes the Velero Helm release to a <see cref="KubernetesBundle"/>.
/// The chart values configure the AWS (S3-compatible) plugin and point the default
/// backup storage location at the MinIO endpoint inside the <c>velero</c> namespace.
/// </summary>
[Injectable(ServiceLifetime.Singleton)]
public sealed class VeleroHelmReleaseContributor : IHelmReleaseContributor
{
    /// <summary>
    /// Cluster selector for this contributor. The value is <c>"*"</c>;
    /// presumably a wildcard meaning "every cluster" (confirm against IHelmReleaseContributor).
    /// </summary>
    public string TargetCluster => "*";

    /// <summary>
    /// Adds the <c>velero</c> Helm release (chart <c>vmware-tanzu/velero</c> 8.2.0) to
    /// <c>bundle.HelmReleases</c>.
    /// </summary>
    /// <param name="bundle">Bundle that receives the Helm release.</param>
    public void Contribute(KubernetesBundle bundle)
    {
        bundle.HelmReleases.Add(new HelmReleaseSpec
        {
            Name = "velero",
            Namespace = "velero",
            Chart = "vmware-tanzu/velero",
            Version = "8.2.0",
            RepoUrl = "https://vmware-tanzu.github.io/helm-charts",
            CreateNamespace = true,
            Wait = true,
            Values = new()
            {
                // The AWS plugin is delivered as an init container that mounts the
                // "plugins" volume at /target.
                ["initContainers"] = new[]
                {
                    new Dictionary<string, object?>
                    {
                        ["name"] = "velero-plugin-for-aws",
                        ["image"] = "velero/velero-plugin-for-aws:v1.11.0",
                        ["volumeMounts"] = new[]
                        {
                            new Dictionary<string, object?>
                            {
                                ["mountPath"] = "/target",
                                ["name"] = "plugins"
                            }
                        }
                    }
                },
                ["configuration"] = new Dictionary<string, object?>
                {
                    ["backupStorageLocation"] = new[]
                    {
                        new Dictionary<string, object?>
                        {
                            ["name"] = "default",
                            ["provider"] = "aws",
                            ["bucket"] = "velero-backups",
                            ["config"] = new Dictionary<string, object?>
                            {
                                ["region"] = "us-east-1",
                                ["s3ForcePathStyle"] = "true",
                                ["s3Url"] = "https://velero-minio-hl.velero.svc.cluster.local:9000",
                                // NOTE(review): TLS verification is switched off for the MinIO
                                // endpoint; confirm this is acceptable outside a lab setup.
                                ["insecureSkipTLSVerify"] = "true"
                            }
                        }
                    },
                    // NOTE(review): an "aws" snapshot location next to a MinIO-backed storage
                    // location; confirm it is actually needed when CSI snapshots are used.
                    ["volumeSnapshotLocation"] = new[]
                    {
                        new Dictionary<string, object?>
                        {
                            ["name"] = "default",
                            ["provider"] = "aws",
                            ["config"] = new Dictionary<string, object?> { ["region"] = "us-east-1" }
                        }
                    },
                    ["features"] = "EnableCSI" // use CSI snapshots when available, restic otherwise
                },
                // Credentials are read from a Secret that must already exist in the namespace.
                ["credentials"] = new Dictionary<string, object?>
                {
                    ["existingSecret"] = "velero-minio-credentials"
                },
                ["deployNodeAgent"] = true // for restic-backed PVC contents
            }
        });
    }
}
The chart installs:
- The Velero controller (one Deployment)
- The node agent (a DaemonSet, used for restic-backed PVC backups)
- The AWS plugin init container (which works against any S3-compatible endpoint, including MinIO)
- A BackupStorageLocation pointing at the velero-backups bucket in the in-cluster MinIO tenant
The Velero MinIO tenant is a separate Tenant CRD instance from Part 30, in the velero namespace, with a single bucket velero-backups. The credentials live in the velero-minio-credentials Secret materialized from ISecretStore via the build-time path from Part 10.
Scheduled backups per namespace
/// <summary>
/// Adds one Velero <c>Schedule</c> manifest per listed namespace to
/// <c>bundle.CrdInstances</c>. Every schedule lives in the <c>velero</c> namespace,
/// is named <c>daily-{namespace}</c>, and uses the cron expression <c>0 2 * * *</c>.
/// </summary>
/// <param name="bundle">Bundle that receives the Schedule manifests.</param>
public void Contribute(KubernetesBundle bundle)
{
    var namespacesToBackup = new[] { "acme-prod", "gitlab", "monitoring", "argocd" };

    foreach (var ns in namespacesToBackup)
    {
        bundle.CrdInstances.Add(new RawManifest
        {
            ApiVersion = "velero.io/v1",
            Kind = "Schedule",
            Metadata = new() { Name = $"daily-{ns}", Namespace = "velero" },
            Spec = new Dictionary<string, object?>
            {
                ["schedule"] = "0 2 * * *",
                // Backup template applied on every run of the schedule.
                ["template"] = new Dictionary<string, object?>
                {
                    ["includedNamespaces"] = new[] { ns },
                    ["storageLocation"] = "default",
                    ["ttl"] = "720h", // 30 days
                    ["defaultVolumesToFsBackup"] = true // restic for PVC contents
                }
            }
        });
    }
}
Four schedules. One per critical namespace. Backups land in MinIO. TTL is 30 days. PVC contents are backed up via restic.
The restore-test job
/// <summary>
/// Adds a <c>CronJob</c> named <c>restore-test</c> (namespace <c>velero</c>) to
/// <c>bundle.CrdInstances</c>. The job's shell script restores the newest completed
/// <c>daily-acme-prod</c> backup into the <c>acme-restore-test</c> namespace, waits for
/// the <c>app=acme-api</c> pods to become Ready, and removes the restore and namespace again.
/// </summary>
/// <remarks>
/// The script picks the backup with <c>kubectl</c> instead of <c>velero backup get -o name</c>,
/// because the Velero CLI accepts only <c>table</c>, <c>json</c> and <c>yaml</c> as output
/// formats. It also computes the restore name once, refuses to run without a backup, starts
/// from an empty target namespace, and cleans up on every exit path so that a retried pod
/// does not trip over leftovers.
/// </remarks>
/// <param name="bundle">Bundle that receives the CronJob manifest.</param>
public void Contribute(KubernetesBundle bundle)
{
    bundle.CrdInstances.Add(new RawManifest
    {
        ApiVersion = "batch/v1",
        Kind = "CronJob",
        Metadata = new() { Name = "restore-test", Namespace = "velero" },
        Spec = new Dictionary<string, object?>
        {
            ["schedule"] = "0 4 * * 1", // every Monday at 04:00
            ["jobTemplate"] = new Dictionary<string, object?>
            {
                ["spec"] = new Dictionary<string, object?>
                {
                    ["template"] = new Dictionary<string, object?>
                    {
                        ["spec"] = new Dictionary<string, object?>
                        {
                            ["restartPolicy"] = "OnFailure",
                            ["serviceAccountName"] = "velero",
                            ["containers"] = new[]
                            {
                                new Dictionary<string, object?>
                                {
                                    ["name"] = "restore-test",
                                    // NOTE(review): the script below needs sh, kubectl, tail and
                                    // date inside this image; confirm velero/velero provides them.
                                    ["image"] = "velero/velero:v1.15.0",
                                    ["command"] = new[]
                                    {
                                        "/bin/sh", "-c", """
                                        set -eux
                                        SCHEDULE=daily-acme-prod
                                        TARGET_NS=acme-restore-test
                                        # Name the restore once so create and delete always agree; the time
                                        # suffix keeps a retried pod from colliding with an earlier attempt
                                        RESTORE="test-restore-$(date +%Y%m%d-%H%M%S)"
                                        # Clean up on every exit path, not only on success
                                        cleanup() {
                                          velero restore delete "$RESTORE" --confirm || true
                                          kubectl delete namespace "$TARGET_NS" --ignore-not-found
                                        }
                                        trap cleanup EXIT
                                        # Start from an empty namespace so leftover pods cannot satisfy the check
                                        kubectl delete namespace "$TARGET_NS" --ignore-not-found --wait=true
                                        # Find the most recent successful backup of acme-prod
                                        LATEST=$(kubectl get backups.velero.io -n velero \
                                          -l velero.io/schedule-name="$SCHEDULE" \
                                          --sort-by=.metadata.creationTimestamp \
                                          -o jsonpath='{range .items[?(@.status.phase=="Completed")]}{.metadata.name}{"\n"}{end}' \
                                          | tail -1)
                                        if [ -z "$LATEST" ]; then
                                          echo "no completed backup found for schedule $SCHEDULE" >&2
                                          exit 1
                                        fi
                                        # Restore it into a throwaway namespace
                                        velero restore create "$RESTORE" \
                                          --from-backup "$LATEST" \
                                          --namespace-mappings acme-prod:"$TARGET_NS" \
                                          --wait
                                        # Verify some pod comes up healthy
                                        kubectl wait --for=condition=Ready pod -l app=acme-api -n "$TARGET_NS" --timeout=300s
                                        """
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    });
}
Every Monday at 04:00, a CronJob runs that:
- Finds the latest successful backup of acme-prod
- Restores it into a fresh throwaway namespace acme-restore-test
- Waits for the workloads to come up healthy
- Cleans up
If any step fails, the CronJob fails and Alertmanager pages the user. This is dogfood loop #4 from homelab-docker Part 06, applied to k8s: the backup that has not been restored is not a backup; the backup that is restored once a week is insurance.
What this gives you that hand-rolled etcdctl snapshot doesn't
An etcdctl snapshot save backs up etcd. It does not back up Postgres data, MinIO objects, or any state outside etcd. A real recovery from an etcdctl snapshot produces a cluster with the same API resources but no actual data — every PVC is empty, and any secret material that was stored in volumes rather than in etcd is gone.
Velero with restic gives you, for the same surface area:
- API resource backup (Deployments, Services, ConfigMaps, Secrets)
- PVC content backup via restic
- Per-namespace scheduled backups
- Restoring into a different namespace (the --namespace-mappings flag)
- CSI snapshot integration when the storage class supports it
- A restore-test that actually verifies the backups are recoverable
The bargain pays back the first time the restore test fails because a CRD field name changed in the new chart version, and you fix it before you actually need a real restore.