aboutsummaryrefslogtreecommitdiff
path: root/roomserver/internal/input/input.go
diff options
context:
space:
mode:
authorNeil Alexander <neilalexander@users.noreply.github.com>2022-01-27 14:29:14 +0000
committerGitHub <noreply@github.com>2022-01-27 14:29:14 +0000
commita763cbb0e1a12828dade855add9a6c30c784baa8 (patch)
tree6d923ee79951fd8b70b6941b86155c9db6294c8f /roomserver/internal/input/input.go
parent5b4999afa9cb095eb5b4c8c163d4063078477baa (diff)
Roomserver/federation input refactor (#2104)
* Put federation client functions into their own file * Look for missing auth events in RS input * Remove retrieveMissingAuthEvents from federation API * Logging * Sorta transplanted the code over * Use event origin failing all else * Don't get stuck on mutexes: * Add verifier * Don't mark state events with zero snapshot NID as not existing * Check missing state if not an outlier before storing the event * Reject instead of soft-fail, don't copy roominfo so much * Use synchronous contexts, limit time to fetch missing events * Clean up some commented out bits * Simplify `/send` endpoint significantly * Submit async * Report errors on sending to RS input * Set max payload in NATS to 16MB * Tweak metrics * Add `workerForRoom` for tidiness * Try skipping unmarshalling errors for RespMissingEvents * Track missing prev events separately to avoid calculating state when not possible * Tweak logic around checking missing state * Care about state when checking missing prev events * Don't check missing state for create events * Try that again * Handle create events better * Send create room events as new * Use given event kind when sending auth/state events * Revert "Use given event kind when sending auth/state events" This reverts commit 089d64d271b5fca8c104e1554711187420dbebca. 
* Only search for missing prev events or state for new events * Tweaks * We only have missing prev if we don't supply state * Room version tweaks * Allow async inputs again * Apply backpressure to consumers/synchronous requests to hopefully stop things being overwhelmed * Set timeouts on roomserver input tasks (need to decide what timeout makes sense) * Use work queue policy, deliver all on restart * Reduce chance of duplicates being sent by NATS * Limit the number of servers we attempt to reduce backpressure * Some review comment fixes * Tidy up a couple things * Don't limit servers, randomise order using map * Some context refactoring * Update gmsl * Don't resend create events * Set stateIDs length correctly or else the roomserver thinks there are missing events when there aren't * Exclude our own servername * Try backing off servers * Make excluding self behaviour optional * Exclude self from g_m_e * Update sytest-whitelist * Update consumers for the roomserver output stream * Remember to send outliers for state returned from /gme * Make full HTTP tests less upsetti * Remove 'If a device list update goes missing, the server resyncs on the next one' from the sytest blacklist * Remove debugging test * Fix blacklist again, remove unnecessary duplicate context * Clearer contexts, don't use background in case there's something happening there * Don't queue up events more than once in memory * Correctly identify create events when checking for state * Fill in gaps again in /gme code * Remove `AuthEventIDs` from `InputRoomEvent` * Remove stray field Co-authored-by: Kegan Dougal <kegan@matrix.org>
Diffstat (limited to 'roomserver/internal/input/input.go')
-rw-r--r--roomserver/internal/input/input.go63
1 files changed, 53 insertions, 10 deletions
diff --git a/roomserver/internal/input/input.go b/roomserver/internal/input/input.go
index 57e51055..9601e018 100644
--- a/roomserver/internal/input/input.go
+++ b/roomserver/internal/input/input.go
@@ -19,12 +19,15 @@ import (
"context"
"encoding/json"
"sync"
+ "time"
"github.com/Arceliar/phony"
"github.com/getsentry/sentry-go"
+ fedapi "github.com/matrix-org/dendrite/federationapi/api"
"github.com/matrix-org/dendrite/internal/hooks"
"github.com/matrix-org/dendrite/roomserver/acls"
"github.com/matrix-org/dendrite/roomserver/api"
+ "github.com/matrix-org/dendrite/roomserver/internal/query"
"github.com/matrix-org/dendrite/roomserver/storage"
"github.com/matrix-org/dendrite/setup/jetstream"
"github.com/matrix-org/gomatrixserverlib"
@@ -45,12 +48,28 @@ type Inputer struct {
JetStream nats.JetStreamContext
Durable nats.SubOpt
ServerName gomatrixserverlib.ServerName
+ FSAPI fedapi.FederationInternalAPI
+ KeyRing gomatrixserverlib.JSONVerifier
ACLs *acls.ServerACLs
InputRoomEventTopic string
OutputRoomEventTopic string
workers sync.Map // room ID -> *phony.Inbox
+
+ Queryer *query.Queryer
+}
+
+func (r *Inputer) workerForRoom(roomID string) *phony.Inbox {
+ inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{})
+ return inbox.(*phony.Inbox)
}
+// eventsInProgress is an in-memory map to keep track of which events we have
+// queued up for processing. If we get a redelivery from NATS and we still have
+// the queued-up item then we won't do anything with the redelivered message. If
+// we've restarted Dendrite and this map is now empty then we will reload the
+// pending work from NATS.
+var eventsInProgress sync.Map
+
// Start subscribes to the roomserver input stream; the callback passed to Subscribe runs when a new event arrives.
func (r *Inputer) Start() error {
_, err := r.JetStream.Subscribe(
@@ -65,11 +84,23 @@ func (r *Inputer) Start() error {
_ = msg.Term()
return
}
- inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{})
+
+ _ = msg.InProgress()
+ index := roomID + "\000" + inputRoomEvent.Event.EventID()
+ if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok {
+ // We're already waiting to deal with this event, so there's no
+ // point in queuing it up again. We've notified NATS that we're
+ // working on the message still, so that will have deferred the
+ // redelivery by a bit.
+ return
+ }
+
roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc()
- inbox.(*phony.Inbox).Act(nil, func() {
+ r.workerForRoom(roomID).Act(nil, func() {
+ _ = msg.InProgress() // resets the acknowledgement wait timer
+ defer eventsInProgress.Delete(index)
defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec()
- if err := r.processRoomEvent(context.TODO(), &inputRoomEvent); err != nil {
+ if err := r.processRoomEvent(context.Background(), &inputRoomEvent); err != nil {
sentry.CaptureException(err)
} else {
hooks.Run(hooks.KindNewEventPersisted, inputRoomEvent.Event)
@@ -82,12 +113,14 @@ func (r *Inputer) Start() error {
// sure that we only acknowledge when we're happy we've done everything we
// can. This ensures we retry things when it makes sense to do so.
nats.ManualAck(),
- // NATS will try to redeliver things to us automatically if we don't ack
- // or nak them within a certain amount of time. This stops that from
- // happening, so we don't end up doing a lot of unnecessary duplicate work.
- nats.MaxDeliver(0),
// Use a durable named consumer.
r.Durable,
+ // If we've missed things in the stream, e.g. we restarted, then replay
+ // all of the queued messages that were waiting for us.
+ nats.DeliverAll(),
+ // Ensure that NATS doesn't try to resend us something that wasn't done
+ // within the period of time that we might still be processing it.
+ nats.AckWait(MaximumProcessingTime+(time.Second*10)),
)
return err
}
@@ -122,11 +155,20 @@ func (r *Inputer) InputRoomEvents(
for _, e := range request.InputRoomEvents {
inputRoomEvent := e
roomID := inputRoomEvent.Event.RoomID()
- inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{})
+ index := roomID + "\000" + inputRoomEvent.Event.EventID()
+ if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok {
+ // We're already waiting to deal with this event, so there's no
+ // point in queuing it up again. NOTE(review): no NATS message is
+ // visible on this request path, and returning here also skips the
+ // remaining events in the request - confirm that this is intended.
+ return
+ }
roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc()
- inbox.(*phony.Inbox).Act(nil, func() {
+ worker := r.workerForRoom(roomID)
+ worker.Act(nil, func() {
+ defer eventsInProgress.Delete(index)
defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec()
- err := r.processRoomEvent(context.TODO(), &inputRoomEvent)
+ err := r.processRoomEvent(ctx, &inputRoomEvent)
if err != nil {
sentry.CaptureException(err)
} else {
@@ -142,6 +184,7 @@ func (r *Inputer) InputRoomEvents(
for i := 0; i < len(request.InputRoomEvents); i++ {
select {
case <-ctx.Done():
+ response.ErrMsg = context.DeadlineExceeded.Error()
return
case err := <-responses:
if err != nil {