feat: add observability improvements (metrics, audit log, structured logging)
All checks were successful
Check / check (pull_request) Successful in 1m45s

- Add Prometheus metrics package (internal/metrics) with deployment,
  container health, webhook, HTTP request, and audit counters/histograms
- Add audit_log SQLite table via migration 007
- Add AuditEntry model with CRUD operations and query methods
- Add audit service (internal/service/audit) for recording user actions
- Instrument deploy service with deployment duration, count, and
  in-flight metrics; container health gauge updates on deploy completion
- Instrument webhook service with event counters by app/type/matched
- Instrument HTTP middleware with request count, duration, and response
  size metrics; also log response bytes in structured request logs
- Add audit logging to all key handler operations: login/logout, app
  CRUD, deploy, cancel, rollback, restart/stop/start, webhook receipt,
  and initial setup
- Add GET /api/audit endpoint for querying recent audit entries
- Make /metrics endpoint always available (optionally auth-protected)
- Add comprehensive tests for metrics, audit model, and audit service
- Update existing test infrastructure with metrics and audit dependencies
- Update README with Observability section documenting all metrics,
  audit log, and structured logging
This commit is contained in:
clawbot
2026-03-17 02:23:44 -07:00
parent fd110e69db
commit f558e2cdd8
21 changed files with 1399 additions and 42 deletions

View File

@@ -0,0 +1,158 @@
package metrics_test
import (
"testing"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/fx"
"sneak.berlin/go/upaas/internal/metrics"
)
func TestNewForTest(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
require.NotNil(t, m)
assert.NotNil(t, m.DeploymentsTotal)
assert.NotNil(t, m.DeploymentDuration)
assert.NotNil(t, m.DeploymentsInFlight)
assert.NotNil(t, m.ContainerHealthy)
assert.NotNil(t, m.WebhookEventsTotal)
assert.NotNil(t, m.HTTPRequestsTotal)
assert.NotNil(t, m.HTTPRequestDuration)
assert.NotNil(t, m.HTTPResponseSizeBytes)
assert.NotNil(t, m.AuditEventsTotal)
}
func TestNew(t *testing.T) {
t.Parallel()
m, err := metrics.New(fx.Lifecycle(nil), metrics.Params{})
require.NoError(t, err)
require.NotNil(t, m)
}
func TestDeploymentMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
m.DeploymentsTotal.WithLabelValues("test-app", "success").Inc()
m.DeploymentDuration.WithLabelValues("test-app", "success").Observe(42.5)
m.DeploymentsInFlight.WithLabelValues("test-app").Set(1)
families, err := reg.Gather()
require.NoError(t, err)
names := make(map[string]bool)
for _, f := range families {
names[f.GetName()] = true
}
assert.True(t, names["upaas_deployments_total"])
assert.True(t, names["upaas_deployments_duration_seconds"])
assert.True(t, names["upaas_deployments_in_flight"])
}
func TestContainerHealthMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
m.ContainerHealthy.WithLabelValues("my-app").Set(1)
families, err := reg.Gather()
require.NoError(t, err)
found := false
for _, f := range families {
if f.GetName() == "upaas_container_healthy" {
found = true
break
}
}
assert.True(t, found)
}
func TestWebhookMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
m.WebhookEventsTotal.WithLabelValues("test-app", "push", "true").Inc()
families, err := reg.Gather()
require.NoError(t, err)
found := false
for _, f := range families {
if f.GetName() == "upaas_webhook_events_total" {
found = true
break
}
}
assert.True(t, found)
}
func TestHTTPMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
m.HTTPRequestsTotal.WithLabelValues("GET", "200").Inc()
m.HTTPRequestDuration.WithLabelValues("GET").Observe(0.05)
m.HTTPResponseSizeBytes.WithLabelValues("GET").Observe(1024)
families, err := reg.Gather()
require.NoError(t, err)
names := make(map[string]bool)
for _, f := range families {
names[f.GetName()] = true
}
assert.True(t, names["upaas_http_requests_total"])
assert.True(t, names["upaas_http_request_duration_seconds"])
assert.True(t, names["upaas_http_response_size_bytes"])
}
func TestAuditMetrics(t *testing.T) {
t.Parallel()
reg := prometheus.NewRegistry()
m := metrics.NewForTest(reg)
m.AuditEventsTotal.WithLabelValues("login").Inc()
families, err := reg.Gather()
require.NoError(t, err)
found := false
for _, f := range families {
if f.GetName() == "upaas_audit_events_total" {
found = true
break
}
}
assert.True(t, found)
}