diff options
author | Neil Alexander <neilalexander@users.noreply.github.com> | 2022-01-27 14:29:14 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-27 14:29:14 +0000 |
commit | a763cbb0e1a12828dade855add9a6c30c784baa8 (patch) | |
tree | 6d923ee79951fd8b70b6941b86155c9db6294c8f /roomserver/internal/input/input.go | |
parent | 5b4999afa9cb095eb5b4c8c163d4063078477baa (diff) |
Roomserver/federation input refactor (#2104)
* Put federation client functions into their own file
* Look for missing auth events in RS input
* Remove retrieveMissingAuthEvents from federation API
* Logging
* Sorta transplanted the code over
* Use event origin failing all else
* Don't get stuck on mutexes:
* Add verifier
* Don't mark state events with zero snapshot NID as not existing
* Check missing state if not an outlier before storing the event
* Reject instead of soft-fail, don't copy roominfo so much
* Use synchronous contexts, limit time to fetch missing events
* Clean up some commented out bits
* Simplify `/send` endpoint significantly
* Submit async
* Report errors on sending to RS input
* Set max payload in NATS to 16MB
* Tweak metrics
* Add `workerForRoom` for tidiness
* Try skipping unmarshalling errors for RespMissingEvents
* Track missing prev events separately to avoid calculating state when not possible
* Tweak logic around checking missing state
* Care about state when checking missing prev events
* Don't check missing state for create events
* Try that again
* Handle create events better
* Send create room events as new
* Use given event kind when sending auth/state events
* Revert "Use given event kind when sending auth/state events"
This reverts commit 089d64d271b5fca8c104e1554711187420dbebca.
* Only search for missing prev events or state for new events
* Tweaks
* We only have missing prev if we don't supply state
* Room version tweaks
* Allow async inputs again
* Apply backpressure to consumers/synchronous requests to hopefully stop things being overwhelmed
* Set timeouts on roomserver input tasks (need to decide what timeout makes sense)
* Use work queue policy, deliver all on restart
* Reduce chance of duplicates being sent by NATS
* Limit the number of servers we attempt to reduce backpressure
* Some review comment fixes
* Tidy up a couple things
* Don't limit servers, randomise order using map
* Some context refactoring
* Update gmsl
* Don't resend create events
* Set stateIDs length correctly or else the roomserver thinks there are missing events when there aren't
* Exclude our own servername
* Try backing off servers
* Make excluding self behaviour optional
* Exclude self from g_m_e
* Update sytest-whitelist
* Update consumers for the roomserver output stream
* Remember to send outliers for state returned from /gme
* Make full HTTP tests less upsetti
* Remove 'If a device list update goes missing, the server resyncs on the next one' from the sytest blacklist
* Remove debugging test
* Fix blacklist again, remove unnecessary duplicate context
* Clearer contexts, don't use background in case there's something happening there
* Don't queue up events more than once in memory
* Correctly identify create events when checking for state
* Fill in gaps again in /gme code
* Remove `AuthEventIDs` from `InputRoomEvent`
* Remove stray field
Co-authored-by: Kegan Dougal <kegan@matrix.org>
Diffstat (limited to 'roomserver/internal/input/input.go')
-rw-r--r-- | roomserver/internal/input/input.go | 63 |
1 files changed, 53 insertions, 10 deletions
diff --git a/roomserver/internal/input/input.go b/roomserver/internal/input/input.go index 57e51055..9601e018 100644 --- a/roomserver/internal/input/input.go +++ b/roomserver/internal/input/input.go @@ -19,12 +19,15 @@ import ( "context" "encoding/json" "sync" + "time" "github.com/Arceliar/phony" "github.com/getsentry/sentry-go" + fedapi "github.com/matrix-org/dendrite/federationapi/api" "github.com/matrix-org/dendrite/internal/hooks" "github.com/matrix-org/dendrite/roomserver/acls" "github.com/matrix-org/dendrite/roomserver/api" + "github.com/matrix-org/dendrite/roomserver/internal/query" "github.com/matrix-org/dendrite/roomserver/storage" "github.com/matrix-org/dendrite/setup/jetstream" "github.com/matrix-org/gomatrixserverlib" @@ -45,12 +48,28 @@ type Inputer struct { JetStream nats.JetStreamContext Durable nats.SubOpt ServerName gomatrixserverlib.ServerName + FSAPI fedapi.FederationInternalAPI + KeyRing gomatrixserverlib.JSONVerifier ACLs *acls.ServerACLs InputRoomEventTopic string OutputRoomEventTopic string workers sync.Map // room ID -> *phony.Inbox + + Queryer *query.Queryer +} + +func (r *Inputer) workerForRoom(roomID string) *phony.Inbox { + inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + return inbox.(*phony.Inbox) } +// eventsInProgress is an in-memory map to keep a track of which events we have +// queued up for processing. If we get a redelivery from NATS and we still have +// the queued up item then we won't do anything with the redelivered message. If +// we've restarted Dendrite and now this map is empty then it means that we will +// reload pending work from NATS. +var eventsInProgress sync.Map + // onMessage is called when a new event arrives in the roomserver input stream. func (r *Inputer) Start() error { _, err := r.JetStream.Subscribe( @@ -65,11 +84,23 @@ func (r *Inputer) Start() error { _ = msg.Term() return } - inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + + _ = msg.InProgress() + index := roomID + "\000" + inputRoomEvent.Event.EventID() + if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok { + // We're already waiting to deal with this event, so there's no + // point in queuing it up again. We've notified NATS that we're + // working on the message still, so that will have deferred the + // redelivery by a bit. + return + } + roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc() - inbox.(*phony.Inbox).Act(nil, func() { + r.workerForRoom(roomID).Act(nil, func() { + _ = msg.InProgress() // resets the acknowledgement wait timer + defer eventsInProgress.Delete(index) defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec() - if err := r.processRoomEvent(context.TODO(), &inputRoomEvent); err != nil { + if err := r.processRoomEvent(context.Background(), &inputRoomEvent); err != nil { sentry.CaptureException(err) } else { hooks.Run(hooks.KindNewEventPersisted, inputRoomEvent.Event) @@ -82,12 +113,14 @@ func (r *Inputer) Start() error { // sure that we only acknowledge when we're happy we've done everything we // can. This ensures we retry things when it makes sense to do so. nats.ManualAck(), - // NATS will try to redeliver things to us automatically if we don't ack - // or nak them within a certain amount of time. This stops that from - // happening, so we don't end up doing a lot of unnecessary duplicate work. - nats.MaxDeliver(0), // Use a durable named consumer. r.Durable, + // If we've missed things in the stream, e.g. we restarted, then replay + // all of the queued messages that were waiting for us. + nats.DeliverAll(), + // Ensure that NATS doesn't try to resend us something that wasn't done + // within the period of time that we might still be processing it. + nats.AckWait(MaximumProcessingTime+(time.Second*10)), ) return err } @@ -122,11 +155,20 @@ func (r *Inputer) InputRoomEvents( for _, e := range request.InputRoomEvents { inputRoomEvent := e roomID := inputRoomEvent.Event.RoomID() - inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + index := roomID + "\000" + inputRoomEvent.Event.EventID() + if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok { + // We're already waiting to deal with this event, so there's no + // point in queuing it up again. We've notified NATS that we're + // working on the message still, so that will have deferred the + // redelivery by a bit. + return + } roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc() - inbox.(*phony.Inbox).Act(nil, func() { + worker := r.workerForRoom(roomID) + worker.Act(nil, func() { + defer eventsInProgress.Delete(index) defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec() - err := r.processRoomEvent(context.TODO(), &inputRoomEvent) + err := r.processRoomEvent(ctx, &inputRoomEvent) if err != nil { sentry.CaptureException(err) } else { @@ -142,6 +184,7 @@ func (r *Inputer) InputRoomEvents( for i := 0; i < len(request.InputRoomEvents); i++ { select { case <-ctx.Done(): + response.ErrMsg = context.DeadlineExceeded.Error() return case err := <-responses: if err != nil { |