feat: add observability improvements (metrics, audit log, structured logging)
All checks were successful
Check / check (pull_request) Successful in 1m45s
All checks were successful
Check / check (pull_request) Successful in 1m45s
- Add Prometheus metrics package (internal/metrics) with deployment, container health, webhook, HTTP request, and audit counters/histograms
- Add audit_log SQLite table via migration 007
- Add AuditEntry model with CRUD operations and query methods
- Add audit service (internal/service/audit) for recording user actions
- Instrument deploy service with deployment duration, count, and in-flight metrics; container health gauge updates on deploy completion
- Instrument webhook service with event counters by app/type/matched
- Instrument HTTP middleware with request count, duration, and response size metrics; also log response bytes in structured request logs
- Add audit logging to all key handler operations: login/logout, app CRUD, deploy, cancel, rollback, restart/stop/start, webhook receipt, and initial setup
- Add GET /api/audit endpoint for querying recent audit entries
- Make /metrics endpoint always available (optionally auth-protected)
- Add comprehensive tests for metrics, audit model, and audit service
- Update existing test infrastructure with metrics and audit dependencies
- Update README with Observability section documenting all metrics, audit log, and structured logging
This commit is contained in:
148
internal/metrics/metrics.go
Normal file
148
internal/metrics/metrics.go
Normal file
@@ -0,0 +1,148 @@
|
||||
// Package metrics provides Prometheus metrics for upaas.
|
||||
//
|
||||
//nolint:revive // "metrics" matches the domain; runtime/metrics is rarely imported directly
|
||||
package metrics
|
||||
|
||||
import (
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/prometheus/client_golang/prometheus/promauto"
|
||||
"go.uber.org/fx"
|
||||
)
|
||||
|
||||
// Params contains dependencies for Metrics.
|
||||
type Params struct {
|
||||
fx.In
|
||||
}
|
||||
|
||||
// Metrics holds all Prometheus metrics for the application.
|
||||
type Metrics struct {
|
||||
// Deployment metrics.
|
||||
DeploymentsTotal *prometheus.CounterVec
|
||||
DeploymentDuration *prometheus.HistogramVec
|
||||
DeploymentsInFlight *prometheus.GaugeVec
|
||||
|
||||
// Container health metrics.
|
||||
ContainerHealthy *prometheus.GaugeVec
|
||||
|
||||
// Webhook metrics.
|
||||
WebhookEventsTotal *prometheus.CounterVec
|
||||
|
||||
// HTTP request metrics.
|
||||
HTTPRequestsTotal *prometheus.CounterVec
|
||||
HTTPRequestDuration *prometheus.HistogramVec
|
||||
HTTPResponseSizeBytes *prometheus.HistogramVec
|
||||
|
||||
// Audit log metrics.
|
||||
AuditEventsTotal *prometheus.CounterVec
|
||||
}
|
||||
|
||||
// New creates a new Metrics instance with all Prometheus metrics registered
|
||||
// in the default Prometheus registry.
|
||||
func New(_ fx.Lifecycle, _ Params) (*Metrics, error) {
|
||||
return newMetrics(promauto.With(prometheus.DefaultRegisterer)), nil
|
||||
}
|
||||
|
||||
// NewForTest creates a Metrics instance with a custom registry for test isolation.
|
||||
func NewForTest(reg prometheus.Registerer) *Metrics {
|
||||
return newMetrics(promauto.With(reg))
|
||||
}
|
||||
|
||||
// newMetrics creates a Metrics instance using the given factory.
|
||||
func newMetrics(factory promauto.Factory) *Metrics {
|
||||
return &Metrics{
|
||||
DeploymentsTotal: newDeploymentsTotal(factory),
|
||||
DeploymentDuration: newDeploymentDuration(factory),
|
||||
DeploymentsInFlight: newDeploymentsInFlight(factory),
|
||||
ContainerHealthy: newContainerHealthy(factory),
|
||||
WebhookEventsTotal: newWebhookEventsTotal(factory),
|
||||
HTTPRequestsTotal: newHTTPRequestsTotal(factory),
|
||||
HTTPRequestDuration: newHTTPRequestDuration(factory),
|
||||
HTTPResponseSizeBytes: newHTTPResponseSizeBytes(factory),
|
||||
AuditEventsTotal: newAuditEventsTotal(factory),
|
||||
}
|
||||
}
|
||||
|
||||
func newDeploymentsTotal(f promauto.Factory) *prometheus.CounterVec {
|
||||
return f.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "deployments",
|
||||
Name: "total",
|
||||
Help: "Total number of deployments by app and status.",
|
||||
}, []string{"app", "status"})
|
||||
}
|
||||
|
||||
func newDeploymentDuration(f promauto.Factory) *prometheus.HistogramVec {
|
||||
return f.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "deployments",
|
||||
Name: "duration_seconds",
|
||||
Help: "Duration of deployments in seconds by app and status.",
|
||||
Buckets: []float64{10, 30, 60, 120, 300, 600, 1800},
|
||||
}, []string{"app", "status"})
|
||||
}
|
||||
|
||||
func newDeploymentsInFlight(f promauto.Factory) *prometheus.GaugeVec {
|
||||
return f.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "deployments",
|
||||
Name: "in_flight",
|
||||
Help: "Number of deployments currently in progress by app.",
|
||||
}, []string{"app"})
|
||||
}
|
||||
|
||||
func newContainerHealthy(f promauto.Factory) *prometheus.GaugeVec {
|
||||
return f.NewGaugeVec(prometheus.GaugeOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "container",
|
||||
Name: "healthy",
|
||||
Help: "Whether the app container is healthy (1) or unhealthy (0).",
|
||||
}, []string{"app"})
|
||||
}
|
||||
|
||||
func newWebhookEventsTotal(f promauto.Factory) *prometheus.CounterVec {
|
||||
return f.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "webhook",
|
||||
Name: "events_total",
|
||||
Help: "Total number of webhook events by app, event type, and matched status.",
|
||||
}, []string{"app", "event_type", "matched"})
|
||||
}
|
||||
|
||||
func newHTTPRequestsTotal(f promauto.Factory) *prometheus.CounterVec {
|
||||
return f.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "http",
|
||||
Name: "requests_total",
|
||||
Help: "Total number of HTTP requests by method and status code.",
|
||||
}, []string{"method", "status_code"})
|
||||
}
|
||||
|
||||
func newHTTPRequestDuration(f promauto.Factory) *prometheus.HistogramVec {
|
||||
return f.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "http",
|
||||
Name: "request_duration_seconds",
|
||||
Help: "Duration of HTTP requests in seconds by method.",
|
||||
Buckets: prometheus.DefBuckets,
|
||||
}, []string{"method"})
|
||||
}
|
||||
|
||||
//nolint:mnd // bucket boundaries are domain-specific constants
|
||||
func newHTTPResponseSizeBytes(f promauto.Factory) *prometheus.HistogramVec {
|
||||
return f.NewHistogramVec(prometheus.HistogramOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "http",
|
||||
Name: "response_size_bytes",
|
||||
Help: "Size of HTTP responses in bytes by method.",
|
||||
Buckets: prometheus.ExponentialBuckets(100, 10, 7),
|
||||
}, []string{"method"})
|
||||
}
|
||||
|
||||
func newAuditEventsTotal(f promauto.Factory) *prometheus.CounterVec {
|
||||
return f.NewCounterVec(prometheus.CounterOpts{
|
||||
Namespace: "upaas",
|
||||
Subsystem: "audit",
|
||||
Name: "events_total",
|
||||
Help: "Total number of audit log events by action.",
|
||||
}, []string{"action"})
|
||||
}
|
||||
158
internal/metrics/metrics_test.go
Normal file
158
internal/metrics/metrics_test.go
Normal file
@@ -0,0 +1,158 @@
|
||||
package metrics_test
|
||||
|
||||
import (
|
||||
"testing"
|
||||
|
||||
"github.com/prometheus/client_golang/prometheus"
|
||||
"github.com/stretchr/testify/assert"
|
||||
"github.com/stretchr/testify/require"
|
||||
"go.uber.org/fx"
|
||||
|
||||
"sneak.berlin/go/upaas/internal/metrics"
|
||||
)
|
||||
|
||||
func TestNewForTest(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
require.NotNil(t, m)
|
||||
assert.NotNil(t, m.DeploymentsTotal)
|
||||
assert.NotNil(t, m.DeploymentDuration)
|
||||
assert.NotNil(t, m.DeploymentsInFlight)
|
||||
assert.NotNil(t, m.ContainerHealthy)
|
||||
assert.NotNil(t, m.WebhookEventsTotal)
|
||||
assert.NotNil(t, m.HTTPRequestsTotal)
|
||||
assert.NotNil(t, m.HTTPRequestDuration)
|
||||
assert.NotNil(t, m.HTTPResponseSizeBytes)
|
||||
assert.NotNil(t, m.AuditEventsTotal)
|
||||
}
|
||||
|
||||
func TestNew(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
m, err := metrics.New(fx.Lifecycle(nil), metrics.Params{})
|
||||
require.NoError(t, err)
|
||||
require.NotNil(t, m)
|
||||
}
|
||||
|
||||
func TestDeploymentMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
m.DeploymentsTotal.WithLabelValues("test-app", "success").Inc()
|
||||
m.DeploymentDuration.WithLabelValues("test-app", "success").Observe(42.5)
|
||||
m.DeploymentsInFlight.WithLabelValues("test-app").Set(1)
|
||||
|
||||
families, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
names := make(map[string]bool)
|
||||
|
||||
for _, f := range families {
|
||||
names[f.GetName()] = true
|
||||
}
|
||||
|
||||
assert.True(t, names["upaas_deployments_total"])
|
||||
assert.True(t, names["upaas_deployments_duration_seconds"])
|
||||
assert.True(t, names["upaas_deployments_in_flight"])
|
||||
}
|
||||
|
||||
func TestContainerHealthMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
m.ContainerHealthy.WithLabelValues("my-app").Set(1)
|
||||
|
||||
families, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
found := false
|
||||
|
||||
for _, f := range families {
|
||||
if f.GetName() == "upaas_container_healthy" {
|
||||
found = true
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
assert.True(t, found)
|
||||
}
|
||||
|
||||
func TestWebhookMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
m.WebhookEventsTotal.WithLabelValues("test-app", "push", "true").Inc()
|
||||
|
||||
families, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
found := false
|
||||
|
||||
for _, f := range families {
|
||||
if f.GetName() == "upaas_webhook_events_total" {
|
||||
found = true
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
assert.True(t, found)
|
||||
}
|
||||
|
||||
func TestHTTPMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
m.HTTPRequestsTotal.WithLabelValues("GET", "200").Inc()
|
||||
m.HTTPRequestDuration.WithLabelValues("GET").Observe(0.05)
|
||||
m.HTTPResponseSizeBytes.WithLabelValues("GET").Observe(1024)
|
||||
|
||||
families, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
names := make(map[string]bool)
|
||||
|
||||
for _, f := range families {
|
||||
names[f.GetName()] = true
|
||||
}
|
||||
|
||||
assert.True(t, names["upaas_http_requests_total"])
|
||||
assert.True(t, names["upaas_http_request_duration_seconds"])
|
||||
assert.True(t, names["upaas_http_response_size_bytes"])
|
||||
}
|
||||
|
||||
func TestAuditMetrics(t *testing.T) {
|
||||
t.Parallel()
|
||||
|
||||
reg := prometheus.NewRegistry()
|
||||
m := metrics.NewForTest(reg)
|
||||
|
||||
m.AuditEventsTotal.WithLabelValues("login").Inc()
|
||||
|
||||
families, err := reg.Gather()
|
||||
require.NoError(t, err)
|
||||
|
||||
found := false
|
||||
|
||||
for _, f := range families {
|
||||
if f.GetName() == "upaas_audit_events_total" {
|
||||
found = true
|
||||
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
assert.True(t, found)
|
||||
}
|
||||
Reference in New Issue
Block a user