speed up database pruning for throwaway copies
Disable crash-safety pragmas (synchronous=OFF, journal_mode=MEMORY), use a 200MB page cache, disable foreign keys to avoid per-row CASCADE overhead, bulk-delete junction tables explicitly, and wrap all deletes in a single transaction.
This commit is contained in:
parent
c31484f0cf
commit
3c890f0b83
@ -16,7 +16,9 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
|
||||
|
||||
slog.Info("pruning database", "keep_from", dayStart, "keep_until", dayEnd)
|
||||
|
||||
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(WAL)&_pragma=foreign_keys(ON)")
|
||||
// No foreign_keys — we handle junction tables manually to avoid per-row CASCADE overhead.
|
||||
// journal_mode=MEMORY and synchronous=OFF are safe because this is a throwaway copy.
|
||||
db, err := sql.Open("sqlite", dbPath+"?_pragma=journal_mode(MEMORY)&_pragma=synchronous(OFF)&_pragma=cache_size(-200000)")
|
||||
if err != nil {
|
||||
return fmt.Errorf("opening database: %w", err)
|
||||
}
|
||||
@ -42,18 +44,42 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
|
||||
targetDay.Format("2006-01-02"))
|
||||
}
|
||||
|
||||
// Delete posts outside target day (CASCADE handles junction tables)
|
||||
tx, err := db.Begin()
|
||||
if err != nil {
|
||||
return fmt.Errorf("beginning transaction: %w", err)
|
||||
}
|
||||
defer tx.Rollback()
|
||||
|
||||
// Delete junction table rows for posts outside target day (bulk, no CASCADE overhead)
|
||||
slog.Info("deleting junction table rows for non-target posts")
|
||||
keepSubquery := "SELECT id FROM posts WHERE timestamp >= ? AND timestamp < ?"
|
||||
|
||||
result, err := tx.Exec("DELETE FROM posts_hashtags WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting posts_hashtags: %w", err)
|
||||
}
|
||||
deleted, _ := result.RowsAffected()
|
||||
slog.Info("deleted posts_hashtags rows", "count", deleted)
|
||||
|
||||
result, err = tx.Exec("DELETE FROM posts_urls WHERE post_id NOT IN ("+keepSubquery+")", dayStart, dayEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting posts_urls: %w", err)
|
||||
}
|
||||
deleted, _ = result.RowsAffected()
|
||||
slog.Info("deleted posts_urls rows", "count", deleted)
|
||||
|
||||
// Delete posts outside target day
|
||||
slog.Info("deleting posts outside target day")
|
||||
result, err := db.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd)
|
||||
result, err = tx.Exec("DELETE FROM posts WHERE timestamp < ? OR timestamp >= ?", dayStart, dayEnd)
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting posts: %w", err)
|
||||
}
|
||||
deleted, _ := result.RowsAffected()
|
||||
deleted, _ = result.RowsAffected()
|
||||
slog.Info("deleted posts", "count", deleted)
|
||||
|
||||
// Clean up orphaned hashtags
|
||||
slog.Info("cleaning orphaned hashtags")
|
||||
result, err = db.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)")
|
||||
result, err = tx.Exec("DELETE FROM hashtags WHERE id NOT IN (SELECT DISTINCT hashtag_id FROM posts_hashtags)")
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting orphaned hashtags: %w", err)
|
||||
}
|
||||
@ -62,16 +88,16 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
|
||||
|
||||
// Clean up orphaned urls
|
||||
slog.Info("cleaning orphaned urls")
|
||||
result, err = db.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)")
|
||||
result, err = tx.Exec("DELETE FROM urls WHERE id NOT IN (SELECT DISTINCT url_id FROM posts_urls)")
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting orphaned urls: %w", err)
|
||||
}
|
||||
deleted, _ = result.RowsAffected()
|
||||
slog.Info("deleted orphaned urls", "count", deleted)
|
||||
|
||||
// Clear media (no FK to posts, not useful in daily segment)
|
||||
// Clear media (no FK to posts, can't prune until post_id migration lands)
|
||||
slog.Info("clearing media table")
|
||||
result, err = db.Exec("DELETE FROM media")
|
||||
result, err = tx.Exec("DELETE FROM media")
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting media: %w", err)
|
||||
}
|
||||
@ -80,13 +106,17 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
|
||||
|
||||
// Clean up orphaned users
|
||||
slog.Info("cleaning orphaned users")
|
||||
result, err = db.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)")
|
||||
result, err = tx.Exec("DELETE FROM users WHERE did NOT IN (SELECT DISTINCT user_did FROM posts)")
|
||||
if err != nil {
|
||||
return fmt.Errorf("deleting orphaned users: %w", err)
|
||||
}
|
||||
deleted, _ = result.RowsAffected()
|
||||
slog.Info("deleted orphaned users", "count", deleted)
|
||||
|
||||
if err := tx.Commit(); err != nil {
|
||||
return fmt.Errorf("committing transaction: %w", err)
|
||||
}
|
||||
|
||||
// Verify remaining count
|
||||
var remaining int64
|
||||
if err := db.QueryRow("SELECT COUNT(*) FROM posts").Scan(&remaining); err != nil {
|
||||
@ -98,7 +128,7 @@ func PruneDatabase(dbPath string, targetDay time.Time) error {
|
||||
slog.Info("posts remaining after prune", "count", remaining)
|
||||
|
||||
// VACUUM to reclaim space
|
||||
slog.Info("running VACUUM (this may take a while)")
|
||||
slog.Info("running VACUUM")
|
||||
if _, err := db.Exec("VACUUM"); err != nil {
|
||||
return fmt.Errorf("vacuum: %w", err)
|
||||
}
|
||||
|
||||
Loading…
Reference in New Issue
Block a user