From 3c890f0b83eb11b30df22b81f27009d91fddc989 Mon Sep 17 00:00:00 2001 From: sneak Date: Mon, 9 Feb 2026 01:48:42 -0800 Subject: [PATCH] speed up database pruning for throwaway copies Disable crash-safety pragmas (synchronous=OFF, journal_mode=MEMORY), use a 200MB page cache, disable foreign keys to avoid per-row CASCADE overhead, bulk-delete junction tables explicitly, and wrap all deletes in a single transaction. --- internal/bsdaily/prune.go | 50 +++++++++++++++++++++++++++++++-------- 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/internal/bsdaily/prune.go b/internal/bsdaily/prune.go index 9225b04..7aa6319 100644 --- a/internal/bsdaily/prune.go +++ b/internal/bsdaily/prune.go @@ -16,7 +16,9 @@ func PruneDatabase(dbPath string, targetDay time.Time) error { slog.Info("pruning database", "keep_from", dayStart, "keep_until", dayEnd) - db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)") + // No foreign_keys — we handle junction tables manually to avoid per-row CASCADE overhead. + // journal_mode=MEMORY and synchronous=OFF are safe because this is a throwaway copy. + db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(MEMORY)&_pragma=synchronous(OFF)&_pragma=cache_size(-200000)") if err != nil { return fmt.Errorf("opening database: %w", err) } @@ -42,18 +44,42 @@ func PruneDatabase(dbPath string, targetDay time.Time) error { targetDay.Format("2006-01-02")) } - // Delete posts outside target day (CASCADE handles junction tables) + tx, err := db.Begin() + if err != nil { + return fmt.Errorf("beginning transaction: %w", err) + } + defer tx.Rollback() + + // Delete junction table rows for posts outside target day (bulk, no CASCADE overhead) + slog.Info("deleting junction table rows for non-target posts") + keepSubquery := "SELECT id FROM posts WHERE timestamp >= ? AND timestamp < ?" + + result, err := tx.Exec("DELETE FROM posts_hashtags WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd) + if err != nil { + return fmt.Errorf("deleting posts_hashtags: %w", err) + } + deleted, _ := result.RowsAffected() + slog.Info("deleted posts_hashtags rows", "count", deleted) + + result, err = tx.Exec("DELETE FROM posts_urls WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd) + if err != nil { + return fmt.Errorf("deleting posts_urls: %w", err) + } + deleted, _ = result.RowsAffected() + slog.Info("deleted posts_urls rows", "count", deleted) + + // Delete posts outside target day slog.Info("deleting posts outside target day") - result, err := db.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd) + result, err = tx.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd) if err != nil { return fmt.Errorf("deleting posts: %w", err) } - deleted, _ := result.RowsAffected() + deleted, _ = result.RowsAffected() slog.Info("deleted posts", "count", deleted) // Clean up orphaned hashtags slog.Info("cleaning orphaned hashtags") - result, err = db.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)") + result, err = tx.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)") if err != nil { return fmt.Errorf("deleting orphaned hashtags: %w", err) } @@ -62,16 +88,16 @@ func PruneDatabase(dbPath string, targetDay time.Time) error { // Clean up orphaned urls slog.Info("cleaning orphaned urls") - result, err = db.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)") + result, err = tx.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)") if err != nil { return fmt.Errorf("deleting orphaned urls: %w", err) } deleted, _ = result.RowsAffected() slog.Info("deleted orphaned urls", "count", deleted) - // Clear media (no FK to posts, not useful in daily segment) + // Clear media (no FK to posts, can't prune until post_id migration lands) slog.Info("clearing media table") - result, err = db.Exec("DELETE FROM media") + result, err = tx.Exec("DELETE FROM media") if err != nil { return fmt.Errorf("deleting media: %w", err) } @@ -80,13 +106,17 @@ func PruneDatabase(dbPath string, targetDay time.Time) error { // Clean up orphaned users slog.Info("cleaning orphaned users") - result, err = db.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)") + result, err = tx.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)") if err != nil { return fmt.Errorf("deleting orphaned users: %w", err) } deleted, _ = result.RowsAffected() slog.Info("deleted orphaned users", "count", deleted) + if err := tx.Commit(); err != nil { + return fmt.Errorf("committing transaction: %w", err) + } + // Verify remaining count var remaining int64 if err := db.QueryRow("SELECT COUNT(*) FROM posts").Scan(&remaining); err != nil { @@ -98,7 +128,7 @@ func PruneDatabase(dbPath string, targetDay time.Time) error { slog.Info("posts remaining after prune", "count", remaining) // VACUUM to reclaim space - slog.Info("running VACUUM (this may take a while)") + slog.Info("running VACUUM") if _, err := db.Exec("VACUUM"); err != nil { return fmt.Errorf("vacuum: %w", err) }