// Package chunker splits byte streams and files into content-defined chunks
// using FastCDC and labels every chunk with the SHA-256 digest of its data.
package chunker

import (
	"crypto/sha256"
	"encoding/hex"
	"errors"
	"fmt"
	"io"
	"os"

	"github.com/jotfs/fastcdc-go"
)

// Chunk represents a single chunk of data cut from an input stream.
type Chunk struct {
	Hash   string // Hex-encoded SHA-256 digest of Data
	Data   []byte // Private copy of the chunk's bytes
	Offset int64  // Offset of the chunk's first byte in the original input
	Size   int64  // Length of Data in bytes
}

// Chunker provides content-defined chunking using FastCDC.
type Chunker struct {
	avgChunkSize int
	minChunkSize int
	maxChunkSize int
}

// NewChunker creates a new chunker with the specified average chunk size.
//
// The minimum and maximum chunk sizes are derived from avgChunkSize as
// avg/4 and avg*4. The value is not validated here; any error reported by
// fastcdc.NewChunker for these options surfaces when chunking starts.
func NewChunker(avgChunkSize int64) *Chunker {
	// FastCDC recommends min = avg/4 and max = avg*4
	return &Chunker{
		avgChunkSize: int(avgChunkSize),
		minChunkSize: int(avgChunkSize / 4),
		maxChunkSize: int(avgChunkSize * 4),
	}
}

// ChunkReader splits the reader into content-defined chunks and returns all
// of them in stream order.
//
// Every chunk's data is held in memory at once; use ChunkReaderStreaming to
// process chunks one at a time instead. On error no chunks are returned.
func (c *Chunker) ChunkReader(r io.Reader) ([]Chunk, error) {
	var chunks []Chunk

	// Reuse the streaming implementation so the chunking logic lives in
	// exactly one place. The collector never fails, so the only errors that
	// can come back are the "creating chunker" and "reading chunk" ones.
	err := c.ChunkReaderStreaming(r, func(chunk Chunk) error {
		chunks = append(chunks, chunk)
		return nil
	})
	if err != nil {
		return nil, err
	}
	return chunks, nil
}

// ChunkCallback is called for each chunk as it's processed. Returning a
// non-nil error stops the chunking.
type ChunkCallback func(chunk Chunk) error

// ChunkReaderStreaming splits the reader into chunks and calls the callback
// for each one, in stream order.
//
// It returns nil once the underlying chunker reports io.EOF. It returns a
// wrapped error if the chunker cannot be created, if reading a chunk fails,
// or if the callback returns an error (in which case chunking stops
// immediately).
func (c *Chunker) ChunkReaderStreaming(r io.Reader, callback ChunkCallback) error {
	opts := fastcdc.Options{
		MinSize:     c.minChunkSize,
		AverageSize: c.avgChunkSize,
		MaxSize:     c.maxChunkSize,
	}

	cdc, err := fastcdc.NewChunker(r, opts)
	if err != nil {
		return fmt.Errorf("creating chunker: %w", err)
	}

	var offset int64
	for {
		next, err := cdc.Next()
		if errors.Is(err, io.EOF) {
			return nil
		}
		if err != nil {
			return fmt.Errorf("reading chunk: %w", err)
		}

		digest := sha256.Sum256(next.Data)

		// Make a copy of the data since FastCDC reuses the buffer
		data := make([]byte, len(next.Data))
		copy(data, next.Data)
		size := int64(len(data))

		if err := callback(Chunk{
			Hash:   hex.EncodeToString(digest[:]),
			Data:   data,
			Offset: offset,
			Size:   size,
		}); err != nil {
			return fmt.Errorf("callback error: %w", err)
		}

		offset += size
	}
}

// ChunkFile splits a file into content-defined chunks.
//
// It opens the file at path read-only, chunks its entire contents with
// ChunkReader, and closes the file before returning.
func (c *Chunker) ChunkFile(path string) ([]Chunk, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, fmt.Errorf("opening file: %w", err)
	}
	// The file is only read from, so a failed Close cannot lose data; the
	// error is deliberately ignored rather than masking the chunking result.
	defer func() { _ = file.Close() }()

	return c.ChunkReader(file)
}