fix: clean up orphan resources on deploy cancellation (closes #89) #93

Merged
sneak merged 2 commits from fix/deploy-cancel-cleanup into main 2026-02-20 05:22:59 +01:00
4 changed files with 184 additions and 4 deletions

View File

@ -17,6 +17,7 @@ import (
"github.com/docker/docker/api/types" "github.com/docker/docker/api/types"
"github.com/docker/docker/api/types/container" "github.com/docker/docker/api/types/container"
"github.com/docker/docker/api/types/filters" "github.com/docker/docker/api/types/filters"
"github.com/docker/docker/api/types/image"
"github.com/docker/docker/api/types/mount" "github.com/docker/docker/api/types/mount"
"github.com/docker/docker/api/types/network" "github.com/docker/docker/api/types/network"
"github.com/docker/docker/client" "github.com/docker/docker/client"
@ -739,6 +740,20 @@ func (c *Client) connect(ctx context.Context) error {
return nil return nil
} }
// RemoveImage removes a Docker image by ID or tag.
// It returns nil if the image was successfully removed or does not exist.
func (c *Client) RemoveImage(ctx context.Context, imageID string) error {
_, err := c.docker.ImageRemove(ctx, imageID, image.RemoveOptions{
Force: true,
PruneChildren: true,
})
if err != nil && !client.IsErrNotFound(err) {
return fmt.Errorf("failed to remove image %s: %w", imageID, err)
}
return nil
}
func (c *Client) close() error { func (c *Client) close() error {
if c.docker != nil { if c.docker != nil {
err := c.docker.Close() err := c.docker.Close()

View File

@ -11,6 +11,7 @@ import (
"log/slog" "log/slog"
"os" "os"
"path/filepath" "path/filepath"
"strings"
"sync" "sync"
"time" "time"
@ -472,7 +473,7 @@ func (svc *Service) runBuildAndDeploy(
// Build phase with timeout // Build phase with timeout
imageID, err := svc.buildImageWithTimeout(deployCtx, app, deployment) imageID, err := svc.buildImageWithTimeout(deployCtx, app, deployment)
if err != nil { if err != nil {
cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment) cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment, "")
if cancelErr != nil { if cancelErr != nil {
return cancelErr return cancelErr
} }
@ -485,7 +486,7 @@ func (svc *Service) runBuildAndDeploy(
// Deploy phase with timeout // Deploy phase with timeout
err = svc.deployContainerWithTimeout(deployCtx, app, deployment, imageID) err = svc.deployContainerWithTimeout(deployCtx, app, deployment, imageID)
if err != nil { if err != nil {
cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment) cancelErr := svc.checkCancelled(deployCtx, bgCtx, app, deployment, imageID)
if cancelErr != nil { if cancelErr != nil {
return cancelErr return cancelErr
} }
@ -661,24 +662,76 @@ func (svc *Service) cancelActiveDeploy(appID string) {
} }
// checkCancelled checks if the deploy context was cancelled (by a newer deploy) // checkCancelled checks if the deploy context was cancelled (by a newer deploy)
// and if so, marks the deployment as cancelled. Returns ErrDeployCancelled or nil. // and if so, marks the deployment as cancelled and cleans up orphan resources.
// Returns ErrDeployCancelled or nil.
func (svc *Service) checkCancelled( func (svc *Service) checkCancelled(
deployCtx context.Context, deployCtx context.Context,
bgCtx context.Context, bgCtx context.Context,
app *models.App, app *models.App,
deployment *models.Deployment, deployment *models.Deployment,
imageID string,
) error { ) error {
if !errors.Is(deployCtx.Err(), context.Canceled) { if !errors.Is(deployCtx.Err(), context.Canceled) {
return nil return nil
} }
svc.log.Info("deployment cancelled by newer deploy", "app", app.Name) svc.log.Info("deployment cancelled", "app", app.Name)
svc.cleanupCancelledDeploy(bgCtx, app, deployment, imageID)
_ = deployment.MarkFinished(bgCtx, models.DeploymentStatusCancelled) _ = deployment.MarkFinished(bgCtx, models.DeploymentStatusCancelled)
return ErrDeployCancelled return ErrDeployCancelled
} }
// cleanupCancelledDeploy removes orphan resources left by a cancelled deployment.
func (svc *Service) cleanupCancelledDeploy(
ctx context.Context,
app *models.App,
deployment *models.Deployment,
imageID string,
) {
// Clean up the intermediate Docker image if one was built
if imageID != "" {
removeErr := svc.docker.RemoveImage(ctx, imageID)
if removeErr != nil {
svc.log.Error("failed to remove image from cancelled deploy",
"error", removeErr, "app", app.Name, "image", imageID)
_ = deployment.AppendLog(ctx, "WARNING: failed to clean up image "+imageID+": "+removeErr.Error())
} else {
svc.log.Info("cleaned up image from cancelled deploy",
"app", app.Name, "image", imageID)
_ = deployment.AppendLog(ctx, "Cleaned up intermediate image: "+imageID)
}
}
// Clean up the build directory for this deployment
buildDir := svc.GetBuildDir(app.Name)
entries, err := os.ReadDir(buildDir)
if err != nil {
return
}
prefix := fmt.Sprintf("%d-", deployment.ID)
for _, entry := range entries {
if entry.IsDir() && strings.HasPrefix(entry.Name(), prefix) {

Nit: entry.Name()[:len(prefix)] == prefix — prefer strings.HasPrefix(entry.Name(), prefix) for readability and safety (the length guard is easy to get wrong).

Nit: `entry.Name()[:len(prefix)] == prefix` — prefer `strings.HasPrefix(entry.Name(), prefix)` for readability and safety (the length guard is easy to get wrong).
dirPath := filepath.Join(buildDir, entry.Name())
removeErr := os.RemoveAll(dirPath)
if removeErr != nil {
svc.log.Error("failed to remove build dir from cancelled deploy",
"error", removeErr, "path", dirPath)
} else {
svc.log.Info("cleaned up build dir from cancelled deploy",
"app", app.Name, "path", dirPath)
_ = deployment.AppendLog(ctx, "Cleaned up build directory")
}
}
}
}
func (svc *Service) fetchWebhookEvent( func (svc *Service) fetchWebhookEvent(
ctx context.Context, ctx context.Context,
webhookEventID *int64, webhookEventID *int64,

View File

@ -0,0 +1,63 @@
package deploy_test
import (
"context"
"log/slog"
"os"
"path/filepath"
"testing"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"git.eeqj.de/sneak/upaas/internal/config"
"git.eeqj.de/sneak/upaas/internal/service/deploy"
)
func TestCleanupCancelledDeploy_RemovesBuildDir(t *testing.T) {
t.Parallel()
tmpDir := t.TempDir()
cfg := &config.Config{DataDir: tmpDir}
svc := deploy.NewTestServiceWithConfig(slog.Default(), cfg, nil)
// Create a fake build directory matching the deployment pattern
appName := "test-app"
buildDir := svc.GetBuildDirExported(appName)
require.NoError(t, os.MkdirAll(buildDir, 0o750))
// Create deployment-specific dir: <deploymentID>-<random>
deployDir := filepath.Join(buildDir, "42-abc123")
require.NoError(t, os.MkdirAll(deployDir, 0o750))
// Create a file inside to verify full removal
require.NoError(t, os.WriteFile(filepath.Join(deployDir, "work"), []byte("test"), 0o640))
// Also create a dir for a different deployment (should NOT be removed)
otherDir := filepath.Join(buildDir, "99-xyz789")
require.NoError(t, os.MkdirAll(otherDir, 0o750))
// Run cleanup for deployment 42
svc.CleanupCancelledDeploy(context.Background(), appName, 42, "")
// Deployment 42's dir should be gone
_, err := os.Stat(deployDir)
assert.True(t, os.IsNotExist(err), "deployment build dir should be removed")
// Deployment 99's dir should still exist
_, err = os.Stat(otherDir)
assert.NoError(t, err, "other deployment build dir should not be removed")
}
func TestCleanupCancelledDeploy_NoBuildDir(t *testing.T) {
t.Parallel()
tmpDir := t.TempDir()
cfg := &config.Config{DataDir: tmpDir}
svc := deploy.NewTestServiceWithConfig(slog.Default(), cfg, nil)
// Should not panic when build dir doesn't exist
svc.CleanupCancelledDeploy(context.Background(), "nonexistent-app", 1, "")
}

View File

@ -2,7 +2,14 @@ package deploy
import ( import (
"context" "context"
"fmt"
"log/slog" "log/slog"
"os"
"path/filepath"
"strings"
"git.eeqj.de/sneak/upaas/internal/config"
"git.eeqj.de/sneak/upaas/internal/docker"
) )
// NewTestService creates a Service with minimal dependencies for testing. // NewTestService creates a Service with minimal dependencies for testing.
@ -31,3 +38,45 @@ func (svc *Service) TryLockApp(appID string) bool {
func (svc *Service) UnlockApp(appID string) { func (svc *Service) UnlockApp(appID string) {
svc.unlockApp(appID) svc.unlockApp(appID)
} }
// NewTestServiceWithConfig creates a Service with config and docker client for testing.
func NewTestServiceWithConfig(log *slog.Logger, cfg *config.Config, dockerClient *docker.Client) *Service {
return &Service{
log: log,
config: cfg,
docker: dockerClient,
}
}
// CleanupCancelledDeploy exposes the build directory cleanup portion of
// cleanupCancelledDeploy for testing. It removes build directories matching
// the deployment ID prefix.
func (svc *Service) CleanupCancelledDeploy(
ctx context.Context,

This duplicates the directory cleanup logic from the real cleanupCancelledDeploy. If the real implementation changes (e.g. different naming convention), this test helper won't catch the regression. Consider refactoring so the test exercises the actual code path.

This duplicates the directory cleanup logic from the real `cleanupCancelledDeploy`. If the real implementation changes (e.g. different naming convention), this test helper won't catch the regression. Consider refactoring so the test exercises the actual code path.
appName string,
deploymentID int64,
imageID string,
) {
// We can't create real models.App/Deployment in tests easily,
// so we test the build dir cleanup portion directly.
buildDir := svc.GetBuildDir(appName)
entries, err := os.ReadDir(buildDir)
if err != nil {
return
}
prefix := fmt.Sprintf("%d-", deploymentID)
for _, entry := range entries {
if entry.IsDir() && strings.HasPrefix(entry.Name(), prefix) {
dirPath := filepath.Join(buildDir, entry.Name())
_ = os.RemoveAll(dirPath)
}
}
}
// GetBuildDirExported exposes GetBuildDir for testing.
func (svc *Service) GetBuildDirExported(appName string) string {
return svc.GetBuildDir(appName)
}