feat: add observability improvements (metrics, audit log, structured logging)
All checks were successful
Check / check (pull_request) Successful in 1m45s

- Add Prometheus metrics package (internal/metrics) with deployment,
  container health, webhook, HTTP request, and audit counters/histograms
- Add audit_log SQLite table via migration 007
- Add AuditEntry model with CRUD operations and query methods
- Add audit service (internal/service/audit) for recording user actions
- Instrument deploy service with deployment duration, count, and
  in-flight metrics; container health gauge updates on deploy completion
- Instrument webhook service with event counters by app/type/matched
- Instrument HTTP middleware with request count, duration, and response
  size metrics; also log response bytes in structured request logs
- Add audit logging to all key handler operations: login/logout, app
  CRUD, deploy, cancel, rollback, restart/stop/start, webhook receipt,
  and initial setup
- Add GET /api/audit endpoint for querying recent audit entries
- Make /metrics endpoint always available (optionally auth-protected)
- Add comprehensive tests for metrics, audit model, and audit service
- Update existing test infrastructure with metrics and audit dependencies
- Update README with Observability section documenting all metrics,
  audit log, and structured logging
This commit is contained in:
clawbot
2026-03-17 02:23:44 -07:00
parent fd110e69db
commit f558e2cdd8
21 changed files with 1399 additions and 42 deletions

View File

@@ -21,6 +21,7 @@ import (
"sneak.berlin/go/upaas/internal/database"
"sneak.berlin/go/upaas/internal/docker"
"sneak.berlin/go/upaas/internal/logger"
"sneak.berlin/go/upaas/internal/metrics"
"sneak.berlin/go/upaas/internal/models"
"sneak.berlin/go/upaas/internal/service/notify"
)
@@ -208,6 +209,7 @@ type ServiceParams struct {
Database *database.Database
Docker *docker.Client
Notify *notify.Service
Metrics *metrics.Metrics
}
// activeDeploy tracks a running deployment so it can be cancelled.
@@ -222,6 +224,7 @@ type Service struct {
db *database.Database
docker *docker.Client
notify *notify.Service
metrics *metrics.Metrics
config *config.Config
params *ServiceParams
activeDeploys sync.Map // map[string]*activeDeploy - per-app active deployment tracking
@@ -231,12 +234,13 @@ type Service struct {
// New creates a new deploy Service.
func New(lc fx.Lifecycle, params ServiceParams) (*Service, error) {
svc := &Service{
log: params.Logger.Get(),
db: params.Database,
docker: params.Docker,
notify: params.Notify,
config: params.Config,
params: &params,
log: params.Logger.Get(),
db: params.Database,
docker: params.Docker,
notify: params.Notify,
metrics: params.Metrics,
config: params.Config,
params: &params,
}
if lc != nil {
@@ -327,6 +331,11 @@ func (svc *Service) Deploy(
}
defer svc.unlockApp(app.ID)
// Track in-flight deployments
svc.metrics.DeploymentsInFlight.WithLabelValues(app.Name).Inc()
deployStart := time.Now()
// Set up cancellable context and register as active deploy
deployCtx, cancel := context.WithCancel(ctx)
done := make(chan struct{})
@@ -334,6 +343,7 @@ func (svc *Service) Deploy(
svc.activeDeploys.Store(app.ID, ad)
defer func() {
svc.metrics.DeploymentsInFlight.WithLabelValues(app.Name).Dec()
cancel()
close(done)
svc.activeDeploys.Delete(app.ID)
@@ -359,7 +369,7 @@ func (svc *Service) Deploy(
svc.notify.NotifyBuildStart(bgCtx, app, deployment)
return svc.runBuildAndDeploy(deployCtx, bgCtx, app, deployment)
return svc.runBuildAndDeploy(deployCtx, bgCtx, app, deployment, deployStart)
}
// Rollback rolls back an app to its previous image.
@@ -467,15 +477,20 @@ func (svc *Service) runBuildAndDeploy(
bgCtx context.Context,
app *models.App,
deployment *models.Deployment,
deployStart time.Time,
) error {
// Build phase with timeout
imageID, err := svc.buildImageWithTimeout(deployCtx, app, deployment)
if err != nil {
cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment, "")
if cancelErr != nil {
svc.recordDeployMetrics(app.Name, "cancelled", deployStart)
return cancelErr
}
svc.recordDeployMetrics(app.Name, "failed", deployStart)
return err
}
@@ -486,9 +501,13 @@ func (svc *Service) runBuildAndDeploy(
if err != nil {
cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment, imageID)
if cancelErr != nil {
svc.recordDeployMetrics(app.Name, "cancelled", deployStart)
return cancelErr
}
svc.recordDeployMetrics(app.Name, "failed", deployStart)
return err
}
@@ -504,11 +523,19 @@ func (svc *Service) runBuildAndDeploy(
// Use context.WithoutCancel to ensure health check completes even if
// the parent context is cancelled (e.g., HTTP request ends).
go svc.checkHealthAfterDelay(bgCtx, app, deployment)
go svc.checkHealthAfterDelay(bgCtx, app, deployment, deployStart)
return nil
}
// recordDeployMetrics records deployment completion metrics.
func (svc *Service) recordDeployMetrics(appName, status string, start time.Time) {
duration := time.Since(start).Seconds()
svc.metrics.DeploymentsTotal.WithLabelValues(appName, status).Inc()
svc.metrics.DeploymentDuration.WithLabelValues(appName, status).Observe(duration)
}
// buildImageWithTimeout runs the build phase with a timeout.
func (svc *Service) buildImageWithTimeout(
ctx context.Context,
@@ -1163,6 +1190,7 @@ func (svc *Service) checkHealthAfterDelay(
ctx context.Context,
app *models.App,
deployment *models.Deployment,
deployStart time.Time,
) {
svc.log.Info(
"waiting 60 seconds to check container health",
@@ -1189,6 +1217,8 @@ func (svc *Service) checkHealthAfterDelay(
svc.notify.NotifyDeployFailed(ctx, reloadedApp, deployment, err)
_ = deployment.MarkFinished(ctx, models.DeploymentStatusFailed)
svc.writeLogsToFile(reloadedApp, deployment)
svc.recordDeployMetrics(reloadedApp.Name, "failed", deployStart)
svc.metrics.ContainerHealthy.WithLabelValues(reloadedApp.Name).Set(0)
reloadedApp.Status = models.AppStatusError
_ = reloadedApp.Save(ctx)
@@ -1200,6 +1230,8 @@ func (svc *Service) checkHealthAfterDelay(
svc.notify.NotifyDeploySuccess(ctx, reloadedApp, deployment)
_ = deployment.MarkFinished(ctx, models.DeploymentStatusSuccess)
svc.writeLogsToFile(reloadedApp, deployment)
svc.recordDeployMetrics(reloadedApp.Name, "success", deployStart)
svc.metrics.ContainerHealthy.WithLabelValues(reloadedApp.Name).Set(1)
} else {
svc.log.Warn(
"container unhealthy after 60 seconds",
@@ -1208,6 +1240,8 @@ func (svc *Service) checkHealthAfterDelay(
svc.notify.NotifyDeployFailed(ctx, reloadedApp, deployment, ErrContainerUnhealthy)
_ = deployment.MarkFinished(ctx, models.DeploymentStatusFailed)
svc.writeLogsToFile(reloadedApp, deployment)
svc.recordDeployMetrics(reloadedApp.Name, "failed", deployStart)
svc.metrics.ContainerHealthy.WithLabelValues(reloadedApp.Name).Set(0)
reloadedApp.Status = models.AppStatusError
_ = reloadedApp.Save(ctx)
}