Add periodic WAL checkpointing to fix slow queries

The WAL file was growing to 700MB+, which caused COUNT(*) queries to
time out. Reads must scan the WAL to find the current version of each
page, so a large WAL slows every read.

Add Checkpoint method to database interface and run PASSIVE checkpoints
every 30 seconds via the DBMaintainer. This keeps the WAL small and
maintains fast read performance under heavy write load.
Jeffrey Paul 2026-01-01 05:42:03 -08:00
parent c6fa2b0fbd
commit 8f524485f7
4 changed files with 65 additions and 7 deletions
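
For reference, SQLite keeps the write-ahead log in a sibling file with a "-wal" suffix, so the growth described above can be watched directly on disk. A minimal sketch, not part of this commit; the database path is a placeholder:

package main

import (
	"fmt"
	"log"
	"os"
)

func main() {
	const dbPath = "app.db" // hypothetical path, substitute the real one
	for _, p := range []string{dbPath, dbPath + "-wal"} {
		fi, err := os.Stat(p)
		if err != nil {
			log.Printf("%s: %v", p, err)
			continue
		}
		fmt.Printf("%s: %.1f MB\n", p, float64(fi.Size())/(1<<20))
	}
}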

View File

@@ -1986,3 +1986,14 @@ func (d *Database) Analyze(ctx context.Context) error {
	return nil
}

// Checkpoint runs a WAL checkpoint to transfer data from the WAL to the main database.
// Uses PASSIVE mode which doesn't block writers but may not checkpoint all frames.
func (d *Database) Checkpoint(ctx context.Context) error {
	_, err := d.db.ExecContext(ctx, "PRAGMA wal_checkpoint(PASSIVE)")
	if err != nil {
		return fmt.Errorf("failed to checkpoint WAL: %w", err)
	}
	return nil
}
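
ExecContext discards the single row this PRAGMA returns. A possible variant, not what this commit does, could scan that row to see how much of the WAL a PASSIVE pass actually moved (assumes the context and database/sql imports):

// Sketch only: busy is 1 when the checkpoint was blocked, logFrames is
// the total number of frames in the WAL, and checkpointed is how many
// of them were copied back into the main database file.
func checkpointWithStats(ctx context.Context, db *sql.DB) (busy, logFrames, checkpointed int, err error) {
	err = db.QueryRowContext(ctx, "PRAGMA wal_checkpoint(PASSIVE)").Scan(&busy, &logFrames, &checkpointed)
	return
}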

View File

@@ -87,6 +87,7 @@ type Store interface {
	// Maintenance operations
	Vacuum(ctx context.Context) error
	Analyze(ctx context.Context) error
	Checkpoint(ctx context.Context) error
}

// Ensure Database implements Store
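
The context line above refers to the usual compile-time assertion; the exact line sits outside this hunk, but in Go it typically looks like:

// Fails the build if *Database stops satisfying Store, e.g. if the new
// Checkpoint method were missing.
var _ Store = (*Database)(nil)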

View File

@@ -415,6 +415,11 @@ func (m *mockStore) Analyze(ctx context.Context) error {
	return nil
}

// Checkpoint mock implementation
func (m *mockStore) Checkpoint(ctx context.Context) error {
	return nil
}

func TestRouteWatchLiveFeed(t *testing.T) {
	// Create mock database

View File

@@ -12,6 +12,10 @@ import (
// Database maintenance configuration constants.
const (
	// checkpointInterval is how often to run WAL checkpoint.
	// Frequent checkpoints keep the WAL small, improving read performance.
	checkpointInterval = 30 * time.Second

	// vacuumInterval is how often to run incremental vacuum.
	// Since incremental vacuum only frees ~1000 pages (~4MB) per run,
	// we run it frequently to keep up with deletions.
@@ -20,6 +24,9 @@ const (
	// analyzeInterval is how often to run ANALYZE.
	analyzeInterval = 1 * time.Hour

	// checkpointTimeout is the max time for WAL checkpoint.
	checkpointTimeout = 10 * time.Second

	// vacuumTimeout is the max time for incremental vacuum (should be quick).
	vacuumTimeout = 30 * time.Second
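
The comment above assumes incremental vacuum reclaims roughly 1000 pages (about 4 MB at a 4 KB page size) per pass. In SQLite that corresponds to a pragma like the sketch below; whether the project's Vacuum method issues exactly this statement is not shown in this diff, and the pragma only frees pages when the database was created with auto_vacuum=INCREMENTAL:

// Sketch only: free up to 1000 pages from the freelist in one pass.
func incrementalVacuum(ctx context.Context, db *sql.DB) error {
	_, err := db.ExecContext(ctx, "PRAGMA incremental_vacuum(1000)")
	return err
}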
@@ -35,13 +42,16 @@ type DBMaintainer struct {
	wg sync.WaitGroup

	// Stats tracking
	statsMu             sync.Mutex
	lastCheckpoint      time.Time
	lastVacuum          time.Time
	lastAnalyze         time.Time
	checkpointCount     int
	vacuumCount         int
	analyzeCount        int
	lastCheckpointError error
	lastVacuumError     error
	lastAnalyzeError    error
}
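
The stats fields are only touched under statsMu. A read-side accessor is not part of this diff, but one would typically snapshot the checkpoint fields under the same lock, along the lines of this sketch (the MaintenanceStats type and method name are hypothetical):

// Hypothetical accessor: copy the checkpoint stats while holding the
// lock so callers never observe a partial update from runCheckpoint.
type MaintenanceStats struct {
	LastCheckpoint      time.Time
	CheckpointCount     int
	LastCheckpointError error
}

func (m *DBMaintainer) CheckpointStats() MaintenanceStats {
	m.statsMu.Lock()
	defer m.statsMu.Unlock()
	return MaintenanceStats{
		LastCheckpoint:      m.lastCheckpoint,
		CheckpointCount:     m.checkpointCount,
		LastCheckpointError: m.lastCheckpointError,
	}
}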
// NewDBMaintainer creates a new database maintainer.
@@ -58,6 +68,7 @@ func (m *DBMaintainer) Start() {
	m.wg.Add(1)
	go m.run()
	m.logger.Info("Database maintainer started",
		"checkpoint_interval", checkpointInterval,
		"vacuum_interval", vacuumInterval,
		"analyze_interval", analyzeInterval,
	)
@@ -75,8 +86,10 @@ func (m *DBMaintainer) run() {
	defer m.wg.Done()

	// Use different timers for each task
	checkpointTimer := time.NewTimer(checkpointInterval)
	vacuumTimer := time.NewTimer(vacuumInterval)
	analyzeTimer := time.NewTimer(analyzeInterval)
	defer checkpointTimer.Stop()
	defer vacuumTimer.Stop()
	defer analyzeTimer.Stop()
@@ -85,6 +98,10 @@
		case <-m.stopCh:
			return
		case <-checkpointTimer.C:
			m.runCheckpoint()
			checkpointTimer.Reset(checkpointInterval)
		case <-vacuumTimer.C:
			m.runVacuum()
			vacuumTimer.Reset(vacuumInterval)
@@ -96,6 +113,30 @@
	}
}

// runCheckpoint performs a WAL checkpoint to keep the WAL file small.
func (m *DBMaintainer) runCheckpoint() {
	ctx, cancel := context.WithTimeout(context.Background(), checkpointTimeout)
	defer cancel()

	startTime := time.Now()
	err := m.db.Checkpoint(ctx)

	m.statsMu.Lock()
	m.lastCheckpoint = time.Now()
	m.lastCheckpointError = err
	if err == nil {
		m.checkpointCount++
	}
	m.statsMu.Unlock()

	if err != nil {
		m.logger.Error("WAL checkpoint failed", "error", err, "duration", time.Since(startTime))
	} else {
		m.logger.Debug("WAL checkpoint completed", "duration", time.Since(startTime))
	}
}

// runVacuum performs an incremental vacuum operation on the database.
func (m *DBMaintainer) runVacuum() {
	ctx, cancel := context.WithTimeout(context.Background(), vacuumTimeout)