diff --git a/Makefile b/Makefile index f3a44df..6db7a3b 100644 --- a/Makefile +++ b/Makefile @@ -42,7 +42,7 @@ build: ./$(FN) touch .lintsetup lint: fmt .lintsetup - fgt golint + fgt golint ./... go-get: go get -v diff --git a/cmd/feta/main.go b/cmd/feta/main.go index 14ff4b3..1eeeefc 100644 --- a/cmd/feta/main.go +++ b/cmd/feta/main.go @@ -5,7 +5,11 @@ import "os" import "github.com/sneak/feta" // these are filled in at link-time by the build scripts + +// Version is the git version of the app var Version string + +// Buildarch contains the architecture it is compiled for var Buildarch string func main() { diff --git a/ingester/ingester.go b/ingester/ingester.go index d9ca2cb..2800dad 100644 --- a/ingester/ingester.go +++ b/ingester/ingester.go @@ -5,30 +5,52 @@ import "github.com/rs/zerolog/log" import "github.com/sneak/feta/toot" import "github.com/sneak/feta/storage" +// TootIngester is the data structure for the ingester process that is +// responsible for storing the discovered toots type TootIngester struct { inbound chan *toot.Toot recentlySeen []*seenTootMemo - storageBackend *storage.TootStorageBackend + storageBackend storage.TootStorageBackend } type seenTootMemo struct { lastSeen time.Time - tootHash toot.TootHash + tootHash toot.Hash } +// NewTootIngester returns a fresh TootIngester for your use func NewTootIngester() *TootIngester { ti := new(TootIngester) - ti.inbound = make(chan *toot.Toot, 1) + ti.inbound = make(chan *toot.Toot, 10000) return ti } +// SetStorageBackend takes a type conforming to TootStorageBackend for +// persisting toots somewhere/somehow +func (ti *TootIngester) SetStorageBackend(be storage.TootStorageBackend) { + ti.storageBackend = be +} + +// GetDeliveryChannel returns a channel that receives pointers to toots +// which the ingester will dedupe and store func (ti *TootIngester) GetDeliveryChannel() chan *toot.Toot { return ti.inbound } +// Ingest is the main entrypoint for the TootIngester goroutine func (ti *TootIngester) Ingest() { log.Info().Msg("TootIngester starting") + go ti.readFromInboundChannel() +} + +func (ti *TootIngester) readFromInboundChannel() { for { - time.Sleep(1 * time.Second) // FIXME do something + nt := <-ti.inbound + go ti.storeToot(nt) } } + +func (ti *TootIngester) storeToot(t *toot.Toot) { + // FIXME first check for dupes in recentlySeen + ti.storageBackend.StoreToot(*t) +} diff --git a/instance.go b/instance.go index ffd5fcc..75dc179 100644 --- a/instance.go +++ b/instance.go @@ -441,7 +441,7 @@ func (i *instance) fetchRecentToots() error { log.Info(). Str("hostname", i.hostname). - Int("tootCount", len(*tc)). + Int("tootCount", len(tc)). Msgf("got and parsed toots") i.registerSuccess() i.Event("TOOTS_FETCHED") diff --git a/jsonapis/structures.go b/jsonapis/structures.go index b436868..96c4110 100644 --- a/jsonapis/structures.go +++ b/jsonapis/structures.go @@ -4,6 +4,9 @@ import "time" // thank fuck for https://mholt.github.io/json-to-go/ otherwise // this would have been a giant pain in the dick + +// MastodonIndexResponse is the json api shape from the mastodon instance +// indexer type MastodonIndexResponse struct { Instances []struct { ID string `json:"_id"` @@ -48,6 +51,8 @@ type MastodonIndexResponse struct { } `json:"instances"` } +// PleromaIndexResponse is the json api shape from the pleroma instance +// indexer type PleromaIndexResponse []struct { Domain string `json:"domain"` Title string `json:"title"` @@ -62,6 +67,7 @@ type PleromaIndexResponse []struct { TextLimit int `json:"text_limit"` } +// NodeInfoVersionTwoSchema is the json format of nodeinfo 2.0 type NodeInfoVersionTwoSchema struct { Version string `json:"version"` Software struct { @@ -80,6 +86,7 @@ type NodeInfoVersionTwoSchema struct { OpenRegistrations bool `json:"openRegistrations"` } +// NodeInfoWellKnownResponse is the json format of the nodeinfo schema type NodeInfoWellKnownResponse struct { Links []struct { Rel string `json:"rel"` @@ -87,6 +94,10 @@ type NodeInfoWellKnownResponse struct { } `json:"links"` } +// APISerializedToot is a partial shape of the json serialized form of a +// toot from the mastodon api (also used by pleroma). We save the original +// json from the server though so this is just a minimal subset that we need +// to deserialize for purposes of this indexer app. type APISerializedToot struct { Account struct { Acct string `json:"acct"` diff --git a/seeds/seeds.go b/seeds/seeds.go index d532aba..509c469 100644 --- a/seeds/seeds.go +++ b/seeds/seeds.go @@ -1,5 +1,10 @@ package seeds +// SeedInstances is a list of instance hostnames used to seed the indexer. +// This list so far is a bunch of instances that have been +// banned/defederated by others so it's important to seed them so that we +// can always get their toots for archiving; they will likely not appear in +// common mentions/public indices. var SeedInstances = [...]string{ "splat.soy", "veenus.art", diff --git a/storage/tootstore.go b/storage/tootstore.go index 668285e..1a0c143 100644 --- a/storage/tootstore.go +++ b/storage/tootstore.go @@ -8,22 +8,29 @@ import "sync" import "github.com/sneak/feta/toot" +// TootStorageBackend is the interface to which storage backends must +// conform for storing toots type TootStorageBackend interface { TootExists(t toot.Toot) bool StoreToot(t toot.Toot) error StoreToots(tc []*toot.Toot) error } +// TootFSStorage is a TootStorageBackend that writes to the local +// filesystem. type TootFSStorage struct { root string } +// NewTootFSStorage returns a *TootFSStorage for writing toots to the +// local filesystem func NewTootFSStorage(root string) *TootFSStorage { ts := new(TootFSStorage) ts.root = root return ts } +// StoreToots writes a slice of pointers to toots to disk func (ts *TootFSStorage) StoreToots(tc []*toot.Toot) error { var returnErrors []string for _, item := range tc { @@ -39,6 +46,9 @@ func (ts *TootFSStorage) StoreToots(tc []*toot.Toot) error { return errors.New(strings.Join(returnErrors, "; ")) } +// TootExists checks to see if we have already written a toot to disk or +// not. Note that the ingester de-dupes with a table in memory so that this +// will only really get used on app restarts func (ts *TootFSStorage) TootExists(t toot.Toot) bool { path := t.DiskStoragePath() full := ts.root + "/" + path @@ -49,39 +59,45 @@ func (ts *TootFSStorage) TootExists(t toot.Toot) bool { return true } +// StoreToot writes a single toot to disk func (ts *TootFSStorage) StoreToot(t toot.Toot) error { path := t.DiskStoragePath() full := ts.root + "/" + path return ioutil.WriteFile(full, t.Original, 0644) } +// TootMemoryStorage is a TootStorageBackend that just stores all ingested +// toots in ram forever until the computer fills up and catches fire and explodes type TootMemoryStorage struct { sync.Mutex - toots map[toot.TootHash]toot.Toot + toots map[toot.Hash]toot.Toot //maxSize uint // FIXME support eviction } +// NewTootMemoryStorage returns a *TootMemoryStorage for storing toots in +// ram forever func NewTootMemoryStorage() *TootMemoryStorage { ts := new(TootMemoryStorage) - ts.toots = make(map[toot.TootHash]toot.Toot) + ts.toots = make(map[toot.Hash]toot.Toot) return ts } +// StoreToot saves a single toot into an in-memory hashtable func (ts *TootMemoryStorage) StoreToot(t toot.Toot) { - th := t.Hash - if ts.TootExists(th) { + if ts.TootExists(t) { return } ts.Lock() defer ts.Unlock() - ts.toots[th] = t + ts.toots[t.Hash] = t return } -func (ts *TootMemoryStorage) TootExists(th toot.TootHash) bool { +// TootExists checks to see if we have a toot in memory already +func (ts *TootMemoryStorage) TootExists(t toot.Toot) bool { ts.Lock() defer ts.Unlock() - if _, ok := ts.toots[th]; ok { //this syntax is so gross + if _, ok := ts.toots[t.Hash]; ok { //this syntax is so gross return true } return false diff --git a/toot/toot.go b/toot/toot.go index 8158fbc..83a1e08 100644 --- a/toot/toot.go +++ b/toot/toot.go @@ -5,30 +5,37 @@ import "encoding/json" import "errors" import "strings" import "github.com/sneak/feta/jsonapis" -import "github.com/davecgh/go-spew/spew" + +//import "github.com/davecgh/go-spew/spew" import "github.com/rs/zerolog/log" //import "encoding/hex" import mh "github.com/multiformats/go-multihash" import mhopts "github.com/multiformats/go-multihash/opts" -type TootHash string +// Hash is a type for storing a string-based base58 multihash of a +// toot's identity +type Hash string +// Toot is an object we use internally for storing a discovered toot type Toot struct { Original []byte Parsed *jsonapis.APISerializedToot - Hash TootHash + Hash Hash FromHost string } -func NewTootCollectionFromMastodonAPIResponse(in []byte, hostname string) (*[]Toot, error) { +// NewTootCollectionFromMastodonAPIResponse takes a byte array from a masto +// api response and provides you with a nice array of pointers to parsed +// toots +func NewTootCollectionFromMastodonAPIResponse(in []byte, hostname string) ([]*Toot, error) { var rt []json.RawMessage err := json.Unmarshal(in, &rt) if err != nil { return nil, errors.New("unable to parse api response") } - var tc []Toot + var tc []*Toot // iterate over rawtoots from api for _, item := range rt { @@ -47,11 +54,9 @@ func NewTootCollectionFromMastodonAPIResponse(in []byte, hostname string) (*[]To t.Original = o t.FromHost = hostname t.calcHash() - tc = append(tc, *t) + tc = append(tc, t) } - spew.Dump(tc) - panic("") - return &tc, nil + return tc, nil } func (t *Toot) String() string { @@ -76,6 +81,11 @@ func (t *Toot) multiHash(in []byte) string { return h.B58String() } +// DiskStoragePath is a helper function on a Toot that allows it to provide +// a storage path on disk. This should probably be moved into the FSStorage +// backend instead. FIXME +// It's here because it's a pure function that just formats its own toot attributes +// into a string. func (t *Toot) DiskStoragePath() string { // FIXME make this error if fields are missing // '/YYYYMMDD/example.com/username/YYYY-MM-DD.HHMMSS.username@fromHost.multihash.json' @@ -103,5 +113,5 @@ func (t *Toot) identityHashInput() string { func (t *Toot) calcHash() { hi := t.identityHashInput() - t.Hash = TootHash(t.multiHash([]byte(hi))) + t.Hash = Hash(t.multiHash([]byte(hi))) }