speed up database pruning for throwaway copies

Disable crash-safety pragmas (synchronous=OFF, journal_mode=MEMORY),
use a 200MB page cache, disable foreign keys to avoid per-row CASCADE
overhead, bulk-delete junction tables explicitly, and wrap all deletes
in a single transaction.
This commit is contained in:
Jeffrey Paul 2026-02-09 01:48:42 -08:00
parent c31484f0cf
commit 3c890f0b83

View File

@ -16,7 +16,9 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
slog.Info("pruning database", "keep_from", dayStart, "keep_until", dayEnd)
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)")
// No foreign_keys — we handle junction tables manually to avoid per-row CASCADE overhead.
// journal_mode=MEMORY and synchronous=OFF are safe because this is a throwaway copy.
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(MEMORY)&_pragma=synchronous(OFF)&_pragma=cache_size(-200000)")
if err != nil {
return fmt.Errorf("opening database: %w", err)
}
@ -42,18 +44,42 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
targetDay.Format("2006-01-02"))
}
// Delete posts outside target day (CASCADE handles junction tables)
tx, err := db.Begin()
if err != nil {
return fmt.Errorf("beginning transaction: %w", err)
}
defer tx.Rollback()
// Delete junction table rows for posts outside target day (bulk, no CASCADE overhead)
slog.Info("deleting junction table rows for non-target posts")
keepSubquery := "SELECT id FROM posts WHERE timestamp >= ? AND timestamp < ?"
result, err := tx.Exec("DELETE FROM posts_hashtags WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd)
if err != nil {
return fmt.Errorf("deleting posts_hashtags: %w", err)
}
deleted, _ := result.RowsAffected()
slog.Info("deleted posts_hashtags rows", "count", deleted)
result, err = tx.Exec("DELETE FROM posts_urls WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd)
if err != nil {
return fmt.Errorf("deleting posts_urls: %w", err)
}
deleted, _ = result.RowsAffected()
slog.Info("deleted posts_urls rows", "count", deleted)
// Delete posts outside target day
slog.Info("deleting posts outside target day")
result, err := db.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd)
result, err = tx.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd)
if err != nil {
return fmt.Errorf("deleting posts: %w", err)
}
deleted, _ := result.RowsAffected()
deleted, _ = result.RowsAffected()
slog.Info("deleted posts", "count", deleted)
// Clean up orphaned hashtags
slog.Info("cleaning orphaned hashtags")
result, err = db.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)")
result, err = tx.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)")
if err != nil {
return fmt.Errorf("deleting orphaned hashtags: %w", err)
}
@ -62,16 +88,16 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
// Clean up orphaned urls
slog.Info("cleaning orphaned urls")
result, err = db.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)")
result, err = tx.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)")
if err != nil {
return fmt.Errorf("deleting orphaned urls: %w", err)
}
deleted, _ = result.RowsAffected()
slog.Info("deleted orphaned urls", "count", deleted)
// Clear media (no FK to posts, not useful in daily segment)
// Clear media (no FK to posts, can't prune until post_id migration lands)
slog.Info("clearing media table")
result, err = db.Exec("DELETE FROM media")
result, err = tx.Exec("DELETE FROM media")
if err != nil {
return fmt.Errorf("deleting media: %w", err)
}
@ -80,13 +106,17 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
// Clean up orphaned users
slog.Info("cleaning orphaned users")
result, err = db.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)")
result, err = tx.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)")
if err != nil {
return fmt.Errorf("deleting orphaned users: %w", err)
}
deleted, _ = result.RowsAffected()
slog.Info("deleted orphaned users", "count", deleted)
if err := tx.Commit(); err != nil {
return fmt.Errorf("committing transaction: %w", err)
}
// Verify remaining count
var remaining int64
if err := db.QueryRow("SELECT COUNT(*) FROM posts").Scan(&remaining); err != nil {
@ -98,7 +128,7 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
slog.Info("posts remaining after prune", "count", remaining)
// VACUUM to reclaim space
slog.Info("running VACUUM (this may take a while)")
slog.Info("running VACUUM")
if _, err := db.Exec("VACUUM"); err != nil {
return fmt.Errorf("vacuum: %w", err)
}