feat: add observability improvements (metrics, audit log, structured logging)
All checks were successful
Check / check (pull_request) Successful in 1m45s

- Add Prometheus metrics package (internal/metrics) with deployment,
  container health, webhook, HTTP request, and audit counters/histograms
- Add audit_log SQLite table via migration 007
- Add AuditEntry model with CRUD operations and query methods
- Add audit service (internal/service/audit) for recording user actions
- Instrument deploy service with deployment duration, count, and
  in-flight metrics; container health gauge updates on deploy completion
- Instrument webhook service with event counters by app/type/matched
- Instrument HTTP middleware with request count, duration, and response
  size metrics; also log response bytes in structured request logs
- Add audit logging to all key handler operations: login/logout, app
  CRUD, deploy, cancel, rollback, restart/stop/start, webhook receipt,
  and initial setup
- Add GET /api/audit endpoint for querying recent audit entries
- Make /metrics endpoint always available (optionally auth-protected)
- Add comprehensive tests for metrics, audit model, and audit service
- Update existing test infrastructure with metrics and audit dependencies
- Update README with Observability section documenting all metrics,
  audit log, and structured logging
This commit is contained in:
clawbot
2026-03-17 02:23:44 -07:00
parent fd110e69db
commit f558e2cdd8
21 changed files with 1399 additions and 42 deletions

View File

@@ -1,6 +1,7 @@
package handlers
import (
"database/sql"
"encoding/json"
"net/http"
"strconv"
@@ -120,6 +121,9 @@ func (h *Handlers) HandleAPILoginPOST() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionLogin,
models.AuditResourceSession, "", "api login")
h.respondJSON(writer, request, loginResponse{
UserID: user.ID,
Username: user.Username,
@@ -243,3 +247,79 @@ func (h *Handlers) HandleAPIWhoAmI() http.HandlerFunc {
}, http.StatusOK)
}
}
// auditLogDefaultLimit is the default number of audit entries returned.
const auditLogDefaultLimit = 50
// auditLogMaxLimit is the maximum number of audit entries returned.
const auditLogMaxLimit = 500
// HandleAPIAuditLog returns a handler that lists recent audit log entries.
func (h *Handlers) HandleAPIAuditLog() http.HandlerFunc {
type auditEntryResponse struct {
ID int64 `json:"id"`
UserID *int64 `json:"userId,omitempty"`
Username string `json:"username"`
Action string `json:"action"`
ResourceType string `json:"resourceType"`
ResourceID string `json:"resourceId,omitempty"`
Detail string `json:"detail,omitempty"`
RemoteIP string `json:"remoteIp,omitempty"`
CreatedAt string `json:"createdAt"`
}
return func(writer http.ResponseWriter, request *http.Request) {
limit := auditLogDefaultLimit
if limitStr := request.URL.Query().Get("limit"); limitStr != "" {
parsed, parseErr := strconv.Atoi(limitStr)
if parseErr == nil && parsed > 0 && parsed <= auditLogMaxLimit {
limit = parsed
}
}
entries, err := h.audit.Recent(request.Context(), limit)
if err != nil {
h.log.Error("failed to fetch audit log", "error", err)
h.respondJSON(writer, request,
map[string]string{"error": "failed to fetch audit log"},
http.StatusInternalServerError)
return
}
result := make([]auditEntryResponse, 0, len(entries))
for _, e := range entries {
entry := auditEntryResponse{
ID: e.ID,
Username: e.Username,
Action: string(e.Action),
ResourceType: string(e.ResourceType),
CreatedAt: e.CreatedAt.UTC().Format("2006-01-02T15:04:05Z"),
}
if e.UserID.Valid {
id := e.UserID.Int64
entry.UserID = &id
}
entry.ResourceID = nullStringValue(e.ResourceID)
entry.Detail = nullStringValue(e.Detail)
entry.RemoteIP = nullStringValue(e.RemoteIP)
result = append(result, entry)
}
h.respondJSON(writer, request, result, http.StatusOK)
}
}
// nullStringValue returns the string value if valid, empty string otherwise.
func nullStringValue(ns sql.NullString) string {
if ns.Valid {
return ns.String
}
return ""
}

View File

@@ -119,6 +119,9 @@ func (h *Handlers) HandleAppCreate() http.HandlerFunc { //nolint:funlen // valid
return
}
h.auditLog(request, models.AuditActionAppCreate,
models.AuditResourceApp, createdApp.ID, "created app: "+createdApp.Name)
http.Redirect(writer, request, "/apps/"+createdApp.ID, http.StatusSeeOther)
}
}
@@ -289,6 +292,9 @@ func (h *Handlers) HandleAppUpdate() http.HandlerFunc { //nolint:funlen // valid
return
}
h.auditLog(request, models.AuditActionAppUpdate,
models.AuditResourceApp, application.ID, "updated app: "+application.Name)
redirectURL := "/apps/" + application.ID + "?success=updated"
http.Redirect(writer, request, redirectURL, http.StatusSeeOther)
}
@@ -344,6 +350,9 @@ func (h *Handlers) HandleAppDelete() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionAppDelete,
models.AuditResourceApp, appID, "deleted app: "+application.Name)
http.Redirect(writer, request, "/", http.StatusSeeOther)
}
}
@@ -360,6 +369,9 @@ func (h *Handlers) HandleAppDeploy() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionAppDeploy,
models.AuditResourceApp, application.ID, "manual deploy: "+application.Name)
// Trigger deployment in background with a detached context
// so the deployment continues even if the HTTP request is cancelled
deployCtx := context.WithoutCancel(request.Context())
@@ -399,6 +411,8 @@ func (h *Handlers) HandleCancelDeploy() http.HandlerFunc {
cancelled := h.deploy.CancelDeploy(application.ID)
if cancelled {
h.log.Info("deployment cancelled by user", "app", application.Name)
h.auditLog(request, models.AuditActionDeployCancel,
models.AuditResourceDeployment, application.ID, "cancelled deploy: "+application.Name)
}
http.Redirect(
@@ -430,6 +444,9 @@ func (h *Handlers) HandleAppRollback() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionAppRollback,
models.AuditResourceApp, application.ID, "rolled back: "+application.Name)
http.Redirect(writer, request, "/apps/"+application.ID+"?success=rolledback", http.StatusSeeOther)
}
}
@@ -834,11 +851,29 @@ func (h *Handlers) handleContainerAction(
} else {
h.log.Info("container action completed",
"action", action, "app", application.Name, "container", containerID)
auditAction := containerActionToAuditAction(action)
h.auditLog(request, auditAction,
models.AuditResourceApp, appID, string(action)+" container: "+application.Name)
}
http.Redirect(writer, request, "/apps/"+appID, http.StatusSeeOther)
}
// containerActionToAuditAction maps container actions to audit actions.
func containerActionToAuditAction(action containerAction) models.AuditAction {
switch action {
case actionRestart:
return models.AuditActionAppRestart
case actionStop:
return models.AuditActionAppStop
case actionStart:
return models.AuditActionAppStart
default:
return models.AuditAction("app." + string(action))
}
}
// HandleAppRestart handles restarting an app's container.
func (h *Handlers) HandleAppRestart() http.HandlerFunc {
return func(writer http.ResponseWriter, request *http.Request) {

View File

@@ -3,6 +3,7 @@ package handlers
import (
"net/http"
"sneak.berlin/go/upaas/internal/models"
"sneak.berlin/go/upaas/templates"
)
@@ -61,6 +62,9 @@ func (h *Handlers) HandleLoginPOST() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionLogin,
models.AuditResourceSession, "", "user logged in")
http.Redirect(writer, request, "/", http.StatusSeeOther)
}
}
@@ -68,6 +72,9 @@ func (h *Handlers) HandleLoginPOST() http.HandlerFunc {
// HandleLogout handles logout requests.
func (h *Handlers) HandleLogout() http.HandlerFunc {
return func(writer http.ResponseWriter, request *http.Request) {
h.auditLog(request, models.AuditActionLogout,
models.AuditResourceSession, "", "user logged out")
destroyErr := h.auth.DestroySession(writer, request)
if destroyErr != nil {
h.log.Error("failed to destroy session", "error", destroyErr)

View File

@@ -15,7 +15,9 @@ import (
"sneak.berlin/go/upaas/internal/globals"
"sneak.berlin/go/upaas/internal/healthcheck"
"sneak.berlin/go/upaas/internal/logger"
"sneak.berlin/go/upaas/internal/models"
"sneak.berlin/go/upaas/internal/service/app"
"sneak.berlin/go/upaas/internal/service/audit"
"sneak.berlin/go/upaas/internal/service/auth"
"sneak.berlin/go/upaas/internal/service/deploy"
"sneak.berlin/go/upaas/internal/service/webhook"
@@ -35,6 +37,7 @@ type Params struct {
Deploy *deploy.Service
Webhook *webhook.Service
Docker *docker.Client
Audit *audit.Service
}
// Handlers provides HTTP request handlers.
@@ -48,6 +51,7 @@ type Handlers struct {
deploy *deploy.Service
webhook *webhook.Service
docker *docker.Client
audit *audit.Service
globals *globals.Globals
}
@@ -63,10 +67,48 @@ func New(_ fx.Lifecycle, params Params) (*Handlers, error) {
deploy: params.Deploy,
webhook: params.Webhook,
docker: params.Docker,
audit: params.Audit,
globals: params.Globals,
}, nil
}
// currentUser returns the currently authenticated user, or nil if not authenticated.
func (h *Handlers) currentUser(request *http.Request) *models.User {
user, err := h.auth.GetCurrentUser(request.Context(), request)
if err != nil || user == nil {
return nil
}
return user
}
// auditLog records an audit entry for the current request.
func (h *Handlers) auditLog(
request *http.Request,
action models.AuditAction,
resourceType models.AuditResourceType,
resourceID string,
detail string,
) {
user := h.currentUser(request)
entry := audit.LogEntry{
Action: action,
ResourceType: resourceType,
ResourceID: resourceID,
Detail: detail,
}
if user != nil {
entry.UserID = user.ID
entry.Username = user.Username
} else {
entry.Username = "anonymous"
}
h.audit.LogFromRequest(request.Context(), request, entry)
}
// addGlobals adds version info and CSRF token to template data map.
func (h *Handlers) addGlobals(
data map[string]any,

View File

@@ -11,6 +11,7 @@ import (
"time"
"github.com/go-chi/chi/v5"
"github.com/prometheus/client_golang/prometheus"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"go.uber.org/fx"
@@ -24,8 +25,10 @@ import (
"sneak.berlin/go/upaas/internal/handlers"
"sneak.berlin/go/upaas/internal/healthcheck"
"sneak.berlin/go/upaas/internal/logger"
"sneak.berlin/go/upaas/internal/metrics"
"sneak.berlin/go/upaas/internal/middleware"
"sneak.berlin/go/upaas/internal/service/app"
"sneak.berlin/go/upaas/internal/service/audit"
"sneak.berlin/go/upaas/internal/service/auth"
"sneak.berlin/go/upaas/internal/service/deploy"
"sneak.berlin/go/upaas/internal/service/notify"
@@ -92,7 +95,8 @@ func createAppServices(
logInstance *logger.Logger,
dbInstance *database.Database,
cfg *config.Config,
) (*auth.Service, *app.Service, *deploy.Service, *webhook.Service, *docker.Client) {
metricsInstance *metrics.Metrics,
) (*auth.Service, *app.Service, *deploy.Service, *webhook.Service, *docker.Client, *audit.Service) {
t.Helper()
authSvc, authErr := auth.New(fx.Lifecycle(nil), auth.ServiceParams{
@@ -125,6 +129,7 @@ func createAppServices(
Database: dbInstance,
Docker: dockerClient,
Notify: notifySvc,
Metrics: metricsInstance,
})
require.NoError(t, deployErr)
@@ -132,10 +137,18 @@ func createAppServices(
Logger: logInstance,
Database: dbInstance,
Deploy: deploySvc,
Metrics: metricsInstance,
})
require.NoError(t, webhookErr)
return authSvc, appSvc, deploySvc, webhookSvc, dockerClient
auditSvc, auditErr := audit.New(fx.Lifecycle(nil), audit.ServiceParams{
Logger: logInstance,
Database: dbInstance,
Metrics: metricsInstance,
})
require.NoError(t, auditErr)
return authSvc, appSvc, deploySvc, webhookSvc, dockerClient, auditSvc
}
func setupTestHandlers(t *testing.T) *testContext {
@@ -145,11 +158,14 @@ func setupTestHandlers(t *testing.T) *testContext {
globalInstance, logInstance, dbInstance, hcInstance := createCoreServices(t, cfg)
authSvc, appSvc, deploySvc, webhookSvc, dockerClient := createAppServices(
metricsInstance := metrics.NewForTest(prometheus.NewRegistry())
authSvc, appSvc, deploySvc, webhookSvc, dockerClient, auditSvc := createAppServices(
t,
logInstance,
dbInstance,
cfg,
metricsInstance,
)
handlersInstance, handlerErr := handlers.New(
@@ -164,6 +180,7 @@ func setupTestHandlers(t *testing.T) *testContext {
Deploy: deploySvc,
Webhook: webhookSvc,
Docker: dockerClient,
Audit: auditSvc,
},
)
require.NoError(t, handlerErr)
@@ -173,6 +190,7 @@ func setupTestHandlers(t *testing.T) *testContext {
Globals: globalInstance,
Config: cfg,
Auth: authSvc,
Metrics: metricsInstance,
})
require.NoError(t, mwErr)

View File

@@ -3,6 +3,7 @@ package handlers
import (
"net/http"
"sneak.berlin/go/upaas/internal/models"
"sneak.berlin/go/upaas/templates"
)
@@ -111,6 +112,9 @@ func (h *Handlers) HandleSetupPOST() http.HandlerFunc {
return
}
h.auditLog(request, models.AuditActionSetup,
models.AuditResourceUser, "", "initial setup completed")
http.Redirect(writer, request, "/", http.StatusSeeOther)
}
}

View File

@@ -7,13 +7,14 @@ import (
"github.com/go-chi/chi/v5"
"sneak.berlin/go/upaas/internal/models"
"sneak.berlin/go/upaas/internal/service/audit"
)
// maxWebhookBodySize is the maximum allowed size of a webhook request body (1MB).
const maxWebhookBodySize = 1 << 20
// HandleWebhook handles incoming Gitea webhooks.
func (h *Handlers) HandleWebhook() http.HandlerFunc {
func (h *Handlers) HandleWebhook() http.HandlerFunc { //nolint:funlen // audit logging adds necessary length
return func(writer http.ResponseWriter, request *http.Request) {
secret := chi.URLParam(request, "secret")
if secret == "" {
@@ -56,6 +57,15 @@ func (h *Handlers) HandleWebhook() http.HandlerFunc {
eventType = "push"
}
// Log webhook receipt
h.audit.LogFromRequest(request.Context(), request, audit.LogEntry{
Username: "webhook",
Action: models.AuditActionWebhookReceive,
ResourceType: models.AuditResourceWebhook,
ResourceID: application.ID,
Detail: "webhook from app: " + application.Name + ", event: " + eventType,
})
// Process webhook
webhookErr := h.webhook.HandleWebhook(
request.Context(),