diff options
author | Neil Alexander <neilalexander@users.noreply.github.com> | 2022-01-27 14:29:14 +0000 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-01-27 14:29:14 +0000 |
commit | a763cbb0e1a12828dade855add9a6c30c784baa8 (patch) | |
tree | 6d923ee79951fd8b70b6941b86155c9db6294c8f /roomserver | |
parent | 5b4999afa9cb095eb5b4c8c163d4063078477baa (diff) |
Roomserver/federation input refactor (#2104)
* Put federation client functions into their own file
* Look for missing auth events in RS input
* Remove retrieveMissingAuthEvents from federation API
* Logging
* Sorta transplanted the code over
* Use event origin failing all else
* Don't get stuck on mutexes:
* Add verifier
* Don't mark state events with zero snapshot NID as not existing
* Check missing state if not an outlier before storing the event
* Reject instead of soft-fail, don't copy roominfo so much
* Use synchronous contexts, limit time to fetch missing events
* Clean up some commented out bits
* Simplify `/send` endpoint significantly
* Submit async
* Report errors on sending to RS input
* Set max payload in NATS to 16MB
* Tweak metrics
* Add `workerForRoom` for tidiness
* Try skipping unmarshalling errors for RespMissingEvents
* Track missing prev events separately to avoid calculating state when not possible
* Tweak logic around checking missing state
* Care about state when checking missing prev events
* Don't check missing state for create events
* Try that again
* Handle create events better
* Send create room events as new
* Use given event kind when sending auth/state events
* Revert "Use given event kind when sending auth/state events"
This reverts commit 089d64d271b5fca8c104e1554711187420dbebca.
* Only search for missing prev events or state for new events
* Tweaks
* We only have missing prev if we don't supply state
* Room version tweaks
* Allow async inputs again
* Apply backpressure to consumers/synchronous requests to hopefully stop things being overwhelmed
* Set timeouts on roomserver input tasks (need to decide what timeout makes sense)
* Use work queue policy, deliver all on restart
* Reduce chance of duplicates being sent by NATS
* Limit the number of servers we attempt to reduce backpressure
* Some review comment fixes
* Tidy up a couple things
* Don't limit servers, randomise order using map
* Some context refactoring
* Update gmsl
* Don't resend create events
* Set stateIDs length correctly or else the roomserver thinks there are missing events when there aren't
* Exclude our own servername
* Try backing off servers
* Make excluding self behaviour optional
* Exclude self from g_m_e
* Update sytest-whitelist
* Update consumers for the roomserver output stream
* Remember to send outliers for state returned from /gme
* Make full HTTP tests less upsetti
* Remove 'If a device list update goes missing, the server resyncs on the next one' from the sytest blacklist
* Remove debugging test
* Fix blacklist again, remove unnecessary duplicate context
* Clearer contexts, don't use background in case there's something happening there
* Don't queue up events more than once in memory
* Correctly identify create events when checking for state
* Fill in gaps again in /gme code
* Remove `AuthEventIDs` from `InputRoomEvent`
* Remove stray field
Co-authored-by: Kegan Dougal <kegan@matrix.org>
Diffstat (limited to 'roomserver')
-rw-r--r-- | roomserver/api/input.go | 8 | ||||
-rw-r--r-- | roomserver/api/wrapper.go | 13 | ||||
-rw-r--r-- | roomserver/internal/api.go | 29 | ||||
-rw-r--r-- | roomserver/internal/helpers/auth.go | 2 | ||||
-rw-r--r-- | roomserver/internal/helpers/helpers.go | 8 | ||||
-rw-r--r-- | roomserver/internal/input/input.go | 63 | ||||
-rw-r--r-- | roomserver/internal/input/input_events.go | 274 | ||||
-rw-r--r-- | roomserver/internal/input/input_latest_events.go | 2 | ||||
-rw-r--r-- | roomserver/internal/input/input_missing.go | 765 | ||||
-rw-r--r-- | roomserver/internal/perform/perform_backfill.go | 4 | ||||
-rw-r--r-- | roomserver/internal/perform/perform_inbound_peek.go | 2 | ||||
-rw-r--r-- | roomserver/internal/perform/perform_invite.go | 4 | ||||
-rw-r--r-- | roomserver/internal/perform/perform_join.go | 1 | ||||
-rw-r--r-- | roomserver/internal/perform/perform_leave.go | 2 | ||||
-rw-r--r-- | roomserver/internal/query/query.go | 12 | ||||
-rw-r--r-- | roomserver/state/state.go | 4 | ||||
-rw-r--r-- | roomserver/storage/postgres/events_table.go | 4 | ||||
-rw-r--r-- | roomserver/storage/sqlite3/events_table.go | 4 |
18 files changed, 1117 insertions, 84 deletions
diff --git a/roomserver/api/input.go b/roomserver/api/input.go index a537e64e..4b0704b9 100644 --- a/roomserver/api/input.go +++ b/roomserver/api/input.go @@ -54,12 +54,8 @@ type InputRoomEvent struct { Kind Kind `json:"kind"` // The event JSON for the event to add. Event *gomatrixserverlib.HeaderedEvent `json:"event"` - // List of state event IDs that authenticate this event. - // These are likely derived from the "auth_events" JSON key of the event. - // But can be different because the "auth_events" key can be incomplete or wrong. - // For example many matrix events forget to reference the m.room.create event even though it is needed for auth. - // (since synapse allows this to happen we have to allow it as well.) - AuthEventIDs []string `json:"auth_event_ids"` + // Which server told us about this event. + Origin gomatrixserverlib.ServerName `json:"origin"` // Whether the state is supplied as a list of event IDs or whether it // should be derived from the state at the previous events. HasState bool `json:"has_state"` diff --git a/roomserver/api/wrapper.go b/roomserver/api/wrapper.go index cdb186c0..e9b94e48 100644 --- a/roomserver/api/wrapper.go +++ b/roomserver/api/wrapper.go @@ -26,6 +26,7 @@ import ( func SendEvents( ctx context.Context, rsAPI RoomserverInternalAPI, kind Kind, events []*gomatrixserverlib.HeaderedEvent, + origin gomatrixserverlib.ServerName, sendAsServer gomatrixserverlib.ServerName, txnID *TransactionID, async bool, ) error { @@ -34,7 +35,7 @@ func SendEvents( ires[i] = InputRoomEvent{ Kind: kind, Event: event, - AuthEventIDs: event.AuthEventIDs(), + Origin: origin, SendAsServer: string(sendAsServer), TransactionID: txnID, } @@ -48,7 +49,7 @@ func SendEvents( func SendEventWithState( ctx context.Context, rsAPI RoomserverInternalAPI, kind Kind, state *gomatrixserverlib.RespState, event *gomatrixserverlib.HeaderedEvent, - haveEventIDs map[string]bool, async bool, + origin gomatrixserverlib.ServerName, haveEventIDs map[string]bool, async bool, ) error { outliers, err := state.Events() if err != nil { @@ -61,9 +62,9 @@ func SendEventWithState( continue } ires = append(ires, InputRoomEvent{ - Kind: KindOutlier, - Event: outlier.Headered(event.RoomVersion), - AuthEventIDs: outlier.AuthEventIDs(), + Kind: KindOutlier, + Event: outlier.Headered(event.RoomVersion), + Origin: origin, }) } @@ -75,7 +76,7 @@ func SendEventWithState( ires = append(ires, InputRoomEvent{ Kind: kind, Event: event, - AuthEventIDs: event.AuthEventIDs(), + Origin: origin, HasState: true, StateEventIDs: stateEventIDs, }) diff --git a/roomserver/internal/api.go b/roomserver/internal/api.go index cf2e59c6..5b87e623 100644 --- a/roomserver/internal/api.go +++ b/roomserver/internal/api.go @@ -37,8 +37,11 @@ type RoomserverInternalAPI struct { Cache caching.RoomServerCaches ServerName gomatrixserverlib.ServerName KeyRing gomatrixserverlib.JSONVerifier + ServerACLs *acls.ServerACLs fsAPI fsAPI.FederationInternalAPI asAPI asAPI.AppServiceQueryAPI + JetStream nats.JetStreamContext + Durable nats.SubOpt InputRoomEventTopic string // JetStream topic for new input room events OutputRoomEventTopic string // JetStream topic for new output room events PerspectiveServerNames []gomatrixserverlib.ServerName @@ -56,21 +59,17 @@ func NewRoomserverAPI( Cache: caches, ServerName: cfg.Matrix.ServerName, PerspectiveServerNames: perspectiveServerNames, + InputRoomEventTopic: inputRoomEventTopic, + OutputRoomEventTopic: outputRoomEventTopic, + JetStream: consumer, + Durable: cfg.Matrix.JetStream.Durable("RoomserverInputConsumer"), + ServerACLs: serverACLs, Queryer: &query.Queryer{ DB: roomserverDB, Cache: caches, ServerName: cfg.Matrix.ServerName, ServerACLs: serverACLs, }, - Inputer: &input.Inputer{ - DB: roomserverDB, - InputRoomEventTopic: inputRoomEventTopic, - OutputRoomEventTopic: outputRoomEventTopic, - JetStream: consumer, - Durable: cfg.Matrix.JetStream.Durable("RoomserverInputConsumer"), - ServerName: cfg.Matrix.ServerName, - ACLs: serverACLs, - }, // perform-er structs get initialised when we have a federation sender to use } return a @@ -83,6 +82,18 @@ func (r *RoomserverInternalAPI) SetFederationAPI(fsAPI fsAPI.FederationInternalA r.fsAPI = fsAPI r.KeyRing = keyRing + r.Inputer = &input.Inputer{ + DB: r.DB, + InputRoomEventTopic: r.InputRoomEventTopic, + OutputRoomEventTopic: r.OutputRoomEventTopic, + JetStream: r.JetStream, + Durable: r.Durable, + ServerName: r.Cfg.Matrix.ServerName, + FSAPI: fsAPI, + KeyRing: keyRing, + ACLs: r.ServerACLs, + Queryer: r.Queryer, + } r.Inviter = &perform.Inviter{ DB: r.DB, Cfg: r.Cfg, diff --git a/roomserver/internal/helpers/auth.go b/roomserver/internal/helpers/auth.go index 1f4215e7..ddda8081 100644 --- a/roomserver/internal/helpers/auth.go +++ b/roomserver/internal/helpers/auth.go @@ -56,7 +56,7 @@ func CheckForSoftFail( // Then get the state entries for the current state snapshot. // We'll use this to check if the event is allowed right now. - roomState := state.NewStateResolution(db, *roomInfo) + roomState := state.NewStateResolution(db, roomInfo) authStateEntries, err = roomState.LoadStateAtSnapshot(ctx, roomInfo.StateSnapshotNID) if err != nil { return true, fmt.Errorf("roomState.LoadStateAtSnapshot: %w", err) diff --git a/roomserver/internal/helpers/helpers.go b/roomserver/internal/helpers/helpers.go index a389cc89..78a875c7 100644 --- a/roomserver/internal/helpers/helpers.go +++ b/roomserver/internal/helpers/helpers.go @@ -179,7 +179,7 @@ func GetMembershipsAtState( return events, nil } -func StateBeforeEvent(ctx context.Context, db storage.Database, info types.RoomInfo, eventNID types.EventNID) ([]types.StateEntry, error) { +func StateBeforeEvent(ctx context.Context, db storage.Database, info *types.RoomInfo, eventNID types.EventNID) ([]types.StateEntry, error) { roomState := state.NewStateResolution(db, info) // Lookup the event NID eIDs, err := db.EventIDs(ctx, []types.EventNID{eventNID}) @@ -223,7 +223,7 @@ func LoadStateEvents( } func CheckServerAllowedToSeeEvent( - ctx context.Context, db storage.Database, info types.RoomInfo, eventID string, serverName gomatrixserverlib.ServerName, isServerInRoom bool, + ctx context.Context, db storage.Database, info *types.RoomInfo, eventID string, serverName gomatrixserverlib.ServerName, isServerInRoom bool, ) (bool, error) { roomState := state.NewStateResolution(db, info) stateEntries, err := roomState.LoadStateAtEvent(ctx, eventID) @@ -279,7 +279,7 @@ func CheckServerAllowedToSeeEvent( // TODO: Remove this when we have tests to assert correctness of this function func ScanEventTree( - ctx context.Context, db storage.Database, info types.RoomInfo, front []string, visited map[string]bool, limit int, + ctx context.Context, db storage.Database, info *types.RoomInfo, front []string, visited map[string]bool, limit int, serverName gomatrixserverlib.ServerName, ) ([]types.EventNID, error) { var resultNIDs []types.EventNID @@ -387,7 +387,7 @@ func QueryLatestEventsAndState( return nil } - roomState := state.NewStateResolution(db, *roomInfo) + roomState := state.NewStateResolution(db, roomInfo) response.RoomExists = true response.RoomVersion = roomInfo.RoomVersion diff --git a/roomserver/internal/input/input.go b/roomserver/internal/input/input.go index 57e51055..9601e018 100644 --- a/roomserver/internal/input/input.go +++ b/roomserver/internal/input/input.go @@ -19,12 +19,15 @@ import ( "context" "encoding/json" "sync" + "time" "github.com/Arceliar/phony" "github.com/getsentry/sentry-go" + fedapi "github.com/matrix-org/dendrite/federationapi/api" "github.com/matrix-org/dendrite/internal/hooks" "github.com/matrix-org/dendrite/roomserver/acls" "github.com/matrix-org/dendrite/roomserver/api" + "github.com/matrix-org/dendrite/roomserver/internal/query" "github.com/matrix-org/dendrite/roomserver/storage" "github.com/matrix-org/dendrite/setup/jetstream" "github.com/matrix-org/gomatrixserverlib" @@ -45,12 +48,28 @@ type Inputer struct { JetStream nats.JetStreamContext Durable nats.SubOpt ServerName gomatrixserverlib.ServerName + FSAPI fedapi.FederationInternalAPI + KeyRing gomatrixserverlib.JSONVerifier ACLs *acls.ServerACLs InputRoomEventTopic string OutputRoomEventTopic string workers sync.Map // room ID -> *phony.Inbox + + Queryer *query.Queryer +} + +func (r *Inputer) workerForRoom(roomID string) *phony.Inbox { + inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + return inbox.(*phony.Inbox) } +// eventsInProgress is an in-memory map to keep a track of which events we have +// queued up for processing. If we get a redelivery from NATS and we still have +// the queued up item then we won't do anything with the redelivered message. If +// we've restarted Dendrite and now this map is empty then it means that we will +// reload pending work from NATS. +var eventsInProgress sync.Map + // onMessage is called when a new event arrives in the roomserver input stream. func (r *Inputer) Start() error { _, err := r.JetStream.Subscribe( @@ -65,11 +84,23 @@ func (r *Inputer) Start() error { _ = msg.Term() return } - inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + + _ = msg.InProgress() + index := roomID + "\000" + inputRoomEvent.Event.EventID() + if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok { + // We're already waiting to deal with this event, so there's no + // point in queuing it up again. We've notified NATS that we're + // working on the message still, so that will have deferred the + // redelivery by a bit. + return + } + roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc() - inbox.(*phony.Inbox).Act(nil, func() { + r.workerForRoom(roomID).Act(nil, func() { + _ = msg.InProgress() // resets the acknowledgement wait timer + defer eventsInProgress.Delete(index) defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec() - if err := r.processRoomEvent(context.TODO(), &inputRoomEvent); err != nil { + if err := r.processRoomEvent(context.Background(), &inputRoomEvent); err != nil { sentry.CaptureException(err) } else { hooks.Run(hooks.KindNewEventPersisted, inputRoomEvent.Event) @@ -82,12 +113,14 @@ func (r *Inputer) Start() error { // sure that we only acknowledge when we're happy we've done everything we // can. This ensures we retry things when it makes sense to do so. nats.ManualAck(), - // NATS will try to redeliver things to us automatically if we don't ack - // or nak them within a certain amount of time. This stops that from - // happening, so we don't end up doing a lot of unnecessary duplicate work. - nats.MaxDeliver(0), // Use a durable named consumer. r.Durable, + // If we've missed things in the stream, e.g. we restarted, then replay + // all of the queued messages that were waiting for us. + nats.DeliverAll(), + // Ensure that NATS doesn't try to resend us something that wasn't done + // within the period of time that we might still be processing it. + nats.AckWait(MaximumProcessingTime+(time.Second*10)), ) return err } @@ -122,11 +155,20 @@ func (r *Inputer) InputRoomEvents( for _, e := range request.InputRoomEvents { inputRoomEvent := e roomID := inputRoomEvent.Event.RoomID() - inbox, _ := r.workers.LoadOrStore(roomID, &phony.Inbox{}) + index := roomID + "\000" + inputRoomEvent.Event.EventID() + if _, ok := eventsInProgress.LoadOrStore(index, struct{}{}); ok { + // We're already waiting to deal with this event, so there's no + // point in queuing it up again. We've notified NATS that we're + // working on the message still, so that will have deferred the + // redelivery by a bit. + return + } roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Inc() - inbox.(*phony.Inbox).Act(nil, func() { + worker := r.workerForRoom(roomID) + worker.Act(nil, func() { + defer eventsInProgress.Delete(index) defer roomserverInputBackpressure.With(prometheus.Labels{"room_id": roomID}).Dec() - err := r.processRoomEvent(context.TODO(), &inputRoomEvent) + err := r.processRoomEvent(ctx, &inputRoomEvent) if err != nil { sentry.CaptureException(err) } else { @@ -142,6 +184,7 @@ func (r *Inputer) InputRoomEvents( for i := 0; i < len(request.InputRoomEvents); i++ { select { case <-ctx.Done(): + response.ErrMsg = context.DeadlineExceeded.Error() return case err := <-responses: if err != nil { diff --git a/roomserver/internal/input/input_events.go b/roomserver/internal/input/input_events.go index 791f7f30..5f911522 100644 --- a/roomserver/internal/input/input_events.go +++ b/roomserver/internal/input/input_events.go @@ -22,6 +22,8 @@ import ( "fmt" "time" + fedapi "github.com/matrix-org/dendrite/federationapi/api" + "github.com/matrix-org/dendrite/internal" "github.com/matrix-org/dendrite/internal/eventutil" "github.com/matrix-org/dendrite/roomserver/api" "github.com/matrix-org/dendrite/roomserver/internal/helpers" @@ -37,6 +39,9 @@ func init() { prometheus.MustRegister(processRoomEventDuration) } +// TODO: Does this value make sense? +const MaximumProcessingTime = time.Minute * 2 + var processRoomEventDuration = prometheus.NewHistogramVec( prometheus.HistogramOpts{ Namespace: "dendrite", @@ -60,9 +65,25 @@ var processRoomEventDuration = prometheus.NewHistogramVec( // TODO: Break up function - we should probably do transaction ID checks before calling this. // nolint:gocyclo func (r *Inputer) processRoomEvent( - ctx context.Context, + inctx context.Context, input *api.InputRoomEvent, ) (err error) { + select { + case <-inctx.Done(): + // Before we do anything, make sure the context hasn't expired for this pending task. + // If it has then we'll give up straight away — it's probably a synchronous input + // request and the caller has already given up, but the inbox task was still queued. + return context.DeadlineExceeded + default: + } + + // Wrap the context with a time limit. We'll allow no more than MaximumProcessingTime for + // everything that we need to do for this event, or it's possible that we could end up wedging + // the roomserver for a very long time. + var cancel context.CancelFunc + ctx, cancel := context.WithTimeout(inctx, MaximumProcessingTime) + defer cancel() + // Measure how long it takes to process this event. started := time.Now() defer func() { @@ -75,6 +96,11 @@ func (r *Inputer) processRoomEvent( // Parse and validate the event JSON headered := input.Event event := headered.Unwrap() + logger := util.GetLogger(ctx).WithFields(logrus.Fields{ + "event_id": event.EventID(), + "room_id": event.RoomID(), + "type": event.Type(), + }) // if we have already got this event then do not process it again, if the input kind is an outlier. // Outliers contain no extra information which may warrant a re-processing. @@ -87,24 +113,67 @@ func (r *Inputer) processRoomEvent( switch idFormat { case gomatrixserverlib.EventIDFormatV1: if bytes.Equal(event.EventReference().EventSHA256, evs[0].EventReference().EventSHA256) { - util.GetLogger(ctx).WithField("event_id", event.EventID()).Infof("Already processed event; ignoring") + logger.Debugf("Already processed event; ignoring") return nil } default: - util.GetLogger(ctx).WithField("event_id", event.EventID()).Infof("Already processed event; ignoring") + logger.Debugf("Already processed event; ignoring") return nil } } } } - // Check that the event passes authentication checks and work out - // the numeric IDs for the auth events. + missingRes := &api.QueryMissingAuthPrevEventsResponse{} + serverRes := &fedapi.QueryJoinedHostServerNamesInRoomResponse{} + if event.Type() != gomatrixserverlib.MRoomCreate || !event.StateKeyEquals("") { + missingReq := &api.QueryMissingAuthPrevEventsRequest{ + RoomID: event.RoomID(), + AuthEventIDs: event.AuthEventIDs(), + PrevEventIDs: event.PrevEventIDs(), + } + if err = r.Queryer.QueryMissingAuthPrevEvents(ctx, missingReq, missingRes); err != nil { + return fmt.Errorf("r.Queryer.QueryMissingAuthPrevEvents: %w", err) + } + } + if len(missingRes.MissingAuthEventIDs) > 0 || len(missingRes.MissingPrevEventIDs) > 0 { + serverReq := &fedapi.QueryJoinedHostServerNamesInRoomRequest{ + RoomID: event.RoomID(), + ExcludeSelf: true, + } + if err = r.FSAPI.QueryJoinedHostServerNamesInRoom(ctx, serverReq, serverRes); err != nil { + return fmt.Errorf("r.FSAPI.QueryJoinedHostServerNamesInRoom: %w", err) + } + } + if input.Origin != "" { + serverRes.ServerNames = append(serverRes.ServerNames, input.Origin) + } + + // First of all, check that the auth events of the event are known. + // If they aren't then we will ask the federation API for them. isRejected := false - authEventNIDs, rejectionErr := helpers.CheckAuthEvents(ctx, r.DB, headered, input.AuthEventIDs) - if rejectionErr != nil { - logrus.WithError(rejectionErr).WithField("event_id", event.EventID()).WithField("auth_event_ids", input.AuthEventIDs).Error("helpers.CheckAuthEvents failed for event, rejecting event") + authEvents := gomatrixserverlib.NewAuthEvents(nil) + knownEvents := map[string]*types.Event{} + if err = r.fetchAuthEvents(ctx, logger, headered, &authEvents, knownEvents, serverRes.ServerNames); err != nil { + return fmt.Errorf("r.checkForMissingAuthEvents: %w", err) + } + + // Check if the event is allowed by its auth events. If it isn't then + // we consider the event to be "rejected" — it will still be persisted. + var rejectionErr error + if rejectionErr = gomatrixserverlib.Allowed(event, &authEvents); rejectionErr != nil { isRejected = true + logger.WithError(rejectionErr).Warnf("Event %s rejected", event.EventID()) + } + + // Accumulate the auth event NIDs. + authEventIDs := event.AuthEventIDs() + authEventNIDs := make([]types.EventNID, 0, len(authEventIDs)) + for _, authEventID := range authEventIDs { + if _, ok := knownEvents[authEventID]; !ok { + return fmt.Errorf("missing auth event %s", authEventID) + } + authEventNIDs = append(authEventNIDs, knownEvents[authEventID].EventNID) } var softfail bool @@ -113,11 +182,50 @@ func (r *Inputer) processRoomEvent( // current room state. softfail, err = helpers.CheckForSoftFail(ctx, r.DB, headered, input.StateEventIDs) if err != nil { - logrus.WithFields(logrus.Fields{ - "event_id": event.EventID(), - "type": event.Type(), - "room": event.RoomID(), - }).WithError(err).Info("Error authing soft-failed event") + logger.WithError(err).Info("Error authing soft-failed event") + } + } + + // At this point we are checking whether we know all of the prev events, and + // if we know the state before the prev events. This is necessary before we + // try to do `calculateAndSetState` on the event later, otherwise it will fail + // with missing event NIDs. If there's anything missing then we'll go and fetch + // the prev events and state from the federation. Note that we only do this if + // we weren't already told what the state before the event should be — if the + // HasState option was set and a state set was provided (as is the case in a + // typical federated room join) then we won't bother trying to fetch prev events + // because we may not be allowed to see them and we have no choice but to trust + // the state event IDs provided to us in the join instead. + missingPrev := !input.HasState && len(missingRes.MissingPrevEventIDs) > 0 + if missingPrev && input.Kind == api.KindNew { + // Don't do this for KindOld events, otherwise old events that we fetch + // to satisfy missing prev events/state will end up recursively calling + // processRoomEvent. + if len(serverRes.ServerNames) > 0 { + missingState := missingStateReq{ + origin: input.Origin, + inputer: r, + queryer: r.Queryer, + db: r.DB, + federation: r.FSAPI, + keys: r.KeyRing, + roomsMu: internal.NewMutexByRoom(), + servers: map[gomatrixserverlib.ServerName]struct{}{}, + hadEvents: map[string]bool{}, + haveEvents: map[string]*gomatrixserverlib.HeaderedEvent{}, + } + for _, serverName := range serverRes.ServerNames { + missingState.servers[serverName] = struct{}{} + } + if err = missingState.processEventWithMissingState(ctx, event, headered.RoomVersion); err != nil { + isRejected = true + rejectionErr = fmt.Errorf("missingState.processEventWithMissingState: %w", err) + } else { + missingPrev = false + } + } else { + isRejected = true + rejectionErr = fmt.Errorf("missing prev events and no other servers to ask") } } @@ -140,12 +248,7 @@ func (r *Inputer) processRoomEvent( // doesn't have any associated state to store and we don't need to // notify anyone about it. if input.Kind == api.KindOutlier { - logrus.WithFields(logrus.Fields{ - "event_id": event.EventID(), - "type": event.Type(), - "room": event.RoomID(), - "sender": event.Sender(), - }).Debug("Stored outlier") + logger.Debug("Stored outlier") return nil } @@ -157,24 +260,18 @@ func (r *Inputer) processRoomEvent( return fmt.Errorf("r.DB.RoomInfo missing for room %s", event.RoomID()) } - if stateAtEvent.BeforeStateSnapshotNID == 0 { + if !missingPrev && stateAtEvent.BeforeStateSnapshotNID == 0 { // We haven't calculated a state for this event yet. // Lets calculate one. - err = r.calculateAndSetState(ctx, input, *roomInfo, &stateAtEvent, event, isRejected) - if err != nil && input.Kind != api.KindOld { + err = r.calculateAndSetState(ctx, input, roomInfo, &stateAtEvent, event, isRejected) + if err != nil { return fmt.Errorf("r.calculateAndSetState: %w", err) } } // We stop here if the event is rejected: We've stored it but won't update forward extremities or notify anyone about it. if isRejected || softfail { - logrus.WithFields(logrus.Fields{ - "event_id": event.EventID(), - "type": event.Type(), - "room": event.RoomID(), - "soft_fail": softfail, - "sender": event.Sender(), - }).Debug("Stored rejected event") + logger.WithError(rejectionErr).WithField("soft_fail", softfail).Debug("Stored rejected event") return rejectionErr } @@ -228,10 +325,127 @@ func (r *Inputer) processRoomEvent( return nil } +// fetchAuthEvents will check to see if any of the +// auth events specified by the given event are unknown. If they are +// then we will go off and request them from the federation and then +// store them in the database. By the time this function ends, either +// we've failed to retrieve the auth chain altogether (in which case +// an error is returned) or we've successfully retrieved them all and +// they are now in the database. +func (r *Inputer) fetchAuthEvents( + ctx context.Context, + logger *logrus.Entry, + event *gomatrixserverlib.HeaderedEvent, + auth *gomatrixserverlib.AuthEvents, + known map[string]*types.Event, + servers []gomatrixserverlib.ServerName, +) error { + unknown := map[string]struct{}{} + authEventIDs := event.AuthEventIDs() + if len(authEventIDs) == 0 { + return nil + } + + for _, authEventID := range authEventIDs { + authEvents, err := r.DB.EventsFromIDs(ctx, []string{authEventID}) + if err != nil || len(authEvents) == 0 || authEvents[0].Event == nil { + unknown[authEventID] = struct{}{} + continue + } + ev := authEvents[0] + known[authEventID] = &ev // don't take the pointer of the iterated event + if err = auth.AddEvent(ev.Event); err != nil { + return fmt.Errorf("auth.AddEvent: %w", err) + } + } + + // If there are no missing auth events then there is nothing more + // to do — we've loaded everything that we need. + if len(unknown) == 0 { + return nil + } + + var err error + var res gomatrixserverlib.RespEventAuth + var found bool + for _, serverName := range servers { + // Request the entire auth chain for the event in question. This should + // contain all of the auth events — including ones that we already know — + // so we'll need to filter through those in the next section. + res, err = r.FSAPI.GetEventAuth(ctx, serverName, event.RoomVersion, event.RoomID(), event.EventID()) + if err != nil { + logger.WithError(err).Warnf("Failed to get event auth from federation for %q: %s", event.EventID(), err) + continue + } + found = true + break + } + if !found { + return fmt.Errorf("no servers provided event auth for event ID %q, tried servers %v", event.EventID(), servers) + } + + for _, authEvent := range gomatrixserverlib.ReverseTopologicalOrdering( + res.AuthEvents, + gomatrixserverlib.TopologicalOrderByAuthEvents, + ) { + // If we already know about this event from the database then we don't + // need to store it again or do anything further with it, so just skip + // over it rather than wasting cycles. + if ev, ok := known[authEvent.EventID()]; ok && ev != nil { + continue + } + + // Check the signatures of the event. + // TODO: It really makes sense for the federation API to be doing this, + // because then it can attempt another server if one serves up an event + // with an invalid signature. For now this will do. + if err := authEvent.VerifyEventSignatures(ctx, r.FSAPI.KeyRing()); err != nil { + return fmt.Errorf("event.VerifyEventSignatures: %w", err) + } + + // In order to store the new auth event, we need to know its auth chain + // as NIDs for the `auth_event_nids` column. Let's see if we can find those. + authEventNIDs := make([]types.EventNID, 0, len(authEvent.AuthEventIDs())) + for _, eventID := range authEvent.AuthEventIDs() { + knownEvent, ok := known[eventID] + if !ok { + return fmt.Errorf("missing auth event %s for %s", eventID, authEvent.EventID()) + } + authEventNIDs = append(authEventNIDs, knownEvent.EventNID) + } + + // Let's take a note of the fact that we now know about this event. + if err := auth.AddEvent(authEvent); err != nil { + return fmt.Errorf("auth.AddEvent: %w", err) + } + + // Check if the auth event should be rejected. + isRejected := false + if err := gomatrixserverlib.Allowed(authEvent, auth); err != nil { + isRejected = true + logger.WithError(err).Warnf("Auth event %s rejected", authEvent.EventID()) + } + + // Finally, store the event in the database. + eventNID, _, _, _, _, err := r.DB.StoreEvent(ctx, authEvent, authEventNIDs, isRejected) + if err != nil { + return fmt.Errorf("r.DB.StoreEvent: %w", err) + } + + // Now we know about this event, it was stored and the signatures were OK. + known[authEvent.EventID()] = &types.Event{ + EventNID: eventNID, + Event: authEvent, + } + } + + return nil +} + func (r *Inputer) calculateAndSetState( ctx context.Context, input *api.InputRoomEvent, - roomInfo types.RoomInfo, + roomInfo *types.RoomInfo, stateAtEvent *types.StateAtEvent, event *gomatrixserverlib.Event, isRejected bool, diff --git a/roomserver/internal/input/input_latest_events.go b/roomserver/internal/input/input_latest_events.go index c9264a27..6137941e 100644 --- a/roomserver/internal/input/input_latest_events.go +++ b/roomserver/internal/input/input_latest_events.go @@ -199,7 +199,7 @@ func (u *latestEventsUpdater) doUpdateLatestEvents() error { func (u *latestEventsUpdater) latestState() error { var err error - roomState := state.NewStateResolution(u.api.DB, *u.roomInfo) + roomState := state.NewStateResolution(u.api.DB, u.roomInfo) // Work out if the state at the extremities has actually changed // or not. If they haven't then we won't bother doing all of the diff --git a/roomserver/internal/input/input_missing.go b/roomserver/internal/input/input_missing.go new file mode 100644 index 00000000..44710962 --- /dev/null +++ b/roomserver/internal/input/input_missing.go @@ -0,0 +1,765 @@ +package input + +import ( + "context" + "errors" + "fmt" + "sync" + "time" + + fedapi "github.com/matrix-org/dendrite/federationapi/api" + "github.com/matrix-org/dendrite/internal" + "github.com/matrix-org/dendrite/roomserver/api" + "github.com/matrix-org/dendrite/roomserver/internal/query" + "github.com/matrix-org/dendrite/roomserver/storage" + "github.com/matrix-org/gomatrixserverlib" + "github.com/matrix-org/util" + "github.com/sirupsen/logrus" +) + +type missingStateReq struct { + origin gomatrixserverlib.ServerName + db storage.Database + inputer *Inputer + queryer *query.Queryer + keys gomatrixserverlib.JSONVerifier + federation fedapi.FederationInternalAPI + roomsMu *internal.MutexByRoom + servers map[gomatrixserverlib.ServerName]struct{} + hadEvents map[string]bool + hadEventsMutex sync.Mutex + haveEvents map[string]*gomatrixserverlib.HeaderedEvent + haveEventsMutex sync.Mutex +} + +// processEventWithMissingState is the entrypoint for a missingStateReq +// request, as called from processRoomEvent. +func (t *missingStateReq) processEventWithMissingState( + ctx context.Context, e *gomatrixserverlib.Event, roomVersion gomatrixserverlib.RoomVersion, +) error { + // We are missing the previous events for this events. + // This means that there is a gap in our view of the history of the + // room. There two ways that we can handle such a gap: + // 1) We can fill in the gap using /get_missing_events + // 2) We can leave the gap and request the state of the room at + // this event from the remote server using either /state_ids + // or /state. + // Synapse will attempt to do 1 and if that fails or if the gap is + // too large then it will attempt 2. + // Synapse will use /state_ids if possible since usually the state + // is largely unchanged and it is more efficient to fetch a list of + // event ids and then use /event to fetch the individual events. + // However not all version of synapse support /state_ids so you may + // need to fallback to /state. + logger := util.GetLogger(ctx).WithFields(map[string]interface{}{ + "txn_event": e.EventID(), + "room_id": e.RoomID(), + "txn_prev_events": e.PrevEventIDs(), + }) + + // Attempt to fill in the gap using /get_missing_events + // This will either: + // - fill in the gap completely then process event `e` returning no backwards extremity + // - fail to fill in the gap and tell us to terminate the transaction err=not nil + // - fail to fill in the gap and tell us to fetch state at the new backwards extremity, and to not terminate the transaction + newEvents, isGapFilled, err := t.getMissingEvents(ctx, e, roomVersion) + if err != nil { + return fmt.Errorf("t.getMissingEvents: %w", err) + } + if len(newEvents) == 0 { + return fmt.Errorf("expected to find missing events but didn't") + } + if isGapFilled { + logger.Infof("gap filled by /get_missing_events, injecting %d new events", len(newEvents)) + // we can just inject all the newEvents as new as we may have only missed 1 or 2 events and have filled + // in the gap in the DAG + for _, newEvent := range newEvents { + err = t.inputer.processRoomEvent(ctx, &api.InputRoomEvent{ + Kind: api.KindNew, + Event: newEvent.Headered(roomVersion), + Origin: t.origin, + SendAsServer: api.DoNotSendToOtherServers, + }) + if err != nil { + return fmt.Errorf("t.inputer.processRoomEvent: %w", err) + } + } + return nil + } + + backwardsExtremity := newEvents[0] + newEvents = newEvents[1:] + + type respState struct { + // A snapshot is considered trustworthy if it came from our own roomserver. + // That's because the state will have been through state resolution once + // already in QueryStateAfterEvent. + trustworthy bool + *gomatrixserverlib.RespState + } + + // at this point we know we're going to have a gap: we need to work out the room state at the new backwards extremity. + // Therefore, we cannot just query /state_ids with this event to get the state before. Instead, we need to query + // the state AFTER all the prev_events for this event, then apply state resolution to that to get the state before the event. + var states []*respState + for _, prevEventID := range backwardsExtremity.PrevEventIDs() { + // Look up what the state is after the backward extremity. This will either + // come from the roomserver, if we know all the required events, or it will + // come from a remote server via /state_ids if not. + prevState, trustworthy, lerr := t.lookupStateAfterEvent(ctx, roomVersion, backwardsExtremity.RoomID(), prevEventID) + if lerr != nil { + logger.WithError(lerr).Errorf("Failed to lookup state after prev_event: %s", prevEventID) + return lerr + } + // Append the state onto the collected state. We'll run this through the + // state resolution next. + states = append(states, &respState{trustworthy, prevState}) + } + + // Now that we have collected all of the state from the prev_events, we'll + // run the state through the appropriate state resolution algorithm for the + // room if needed. This does a couple of things: + // 1. Ensures that the state is deduplicated fully for each state-key tuple + // 2. Ensures that we pick the latest events from both sets, in the case that + // one of the prev_events is quite a bit older than the others + resolvedState := &gomatrixserverlib.RespState{} + switch len(states) { + case 0: + extremityIsCreate := backwardsExtremity.Type() == gomatrixserverlib.MRoomCreate && backwardsExtremity.StateKeyEquals("") + if !extremityIsCreate { + // There are no previous states and this isn't the beginning of the + // room - this is an error condition! + logger.Errorf("Failed to lookup any state after prev_events") + return fmt.Errorf("expected %d states but got %d", len(backwardsExtremity.PrevEventIDs()), len(states)) + } + case 1: + // There's only one previous state - if it's trustworthy (came from a + // local state snapshot which will already have been through state res), + // use it as-is. There's no point in resolving it again. + if states[0].trustworthy { + resolvedState = states[0].RespState + break + } + // Otherwise, if it isn't trustworthy (came from federation), run it through + // state resolution anyway for safety, in case there are duplicates. + fallthrough + default: + respStates := make([]*gomatrixserverlib.RespState, len(states)) + for i := range states { + respStates[i] = states[i].RespState + } + // There's more than one previous state - run them all through state res + t.roomsMu.Lock(e.RoomID()) + resolvedState, err = t.resolveStatesAndCheck(ctx, roomVersion, respStates, backwardsExtremity) + t.roomsMu.Unlock(e.RoomID()) + if err != nil { + logger.WithError(err).Errorf("Failed to resolve state conflicts for event %s", backwardsExtremity.EventID()) + return err + } + } + + hadEvents := map[string]bool{} + t.hadEventsMutex.Lock() + for k, v := range t.hadEvents { + hadEvents[k] = v + } + t.hadEventsMutex.Unlock() + + // Send outliers first so we can send the new backwards extremity without causing errors + outliers, err := resolvedState.Events() + if err != nil { + return err + } + var outlierRoomEvents []api.InputRoomEvent + for _, outlier := range outliers { + if hadEvents[outlier.EventID()] { + continue + } + outlierRoomEvents = append(outlierRoomEvents, api.InputRoomEvent{ + Kind: api.KindOutlier, + Event: outlier.Headered(roomVersion), + Origin: t.origin, + }) + } + // TODO: we could do this concurrently? + for _, ire := range outlierRoomEvents { + if err = t.inputer.processRoomEvent(ctx, &ire); err != nil { + return fmt.Errorf("t.inputer.processRoomEvent[outlier]: %w", err) + } + } + + // Now send the backward extremity into the roomserver with the + // newly resolved state. This marks the "oldest" point in the backfill and + // sets the baseline state for any new events after this. + stateIDs := make([]string, 0, len(resolvedState.StateEvents)) + for _, event := range resolvedState.StateEvents { + stateIDs = append(stateIDs, event.EventID()) + } + + err = t.inputer.processRoomEvent(ctx, &api.InputRoomEvent{ + Kind: api.KindOld, + Event: backwardsExtremity.Headered(roomVersion), + Origin: t.origin, + HasState: true, + StateEventIDs: stateIDs, + SendAsServer: api.DoNotSendToOtherServers, + }) + if err != nil { + return fmt.Errorf("t.inputer.processRoomEvent: %w", err) + } + + // Then send all of the newer backfilled events, of which will all be newer + // than the backward extremity, into the roomserver without state. This way + // they will automatically fast-forward based on the room state at the + // extremity in the last step. + for _, newEvent := range newEvents { + err = t.inputer.processRoomEvent(ctx, &api.InputRoomEvent{ + Kind: api.KindOld, + Event: newEvent.Headered(roomVersion), + Origin: t.origin, + SendAsServer: api.DoNotSendToOtherServers, + }) + if err != nil { + return fmt.Errorf("t.inputer.processRoomEvent: %w", err) + } + } + + return nil +} + +// lookupStateAfterEvent returns the room state after `eventID`, which is the state before eventID with the state of `eventID` (if it's a state event) +// added into the mix. +func (t *missingStateReq) lookupStateAfterEvent(ctx context.Context, roomVersion gomatrixserverlib.RoomVersion, roomID, eventID string) (*gomatrixserverlib.RespState, bool, error) { + // try doing all this locally before we resort to querying federation + respState := t.lookupStateAfterEventLocally(ctx, roomID, eventID) + if respState != nil { + return respState, true, nil + } + + respState, err := t.lookupStateBeforeEvent(ctx, roomVersion, roomID, eventID) + if err != nil { + return nil, false, fmt.Errorf("t.lookupStateBeforeEvent: %w", err) + } + + // fetch the event we're missing and add it to the pile + h, err := t.lookupEvent(ctx, roomVersion, roomID, eventID, false) + switch err.(type) { + case verifySigError: + return respState, false, nil + case nil: + // do nothing + default: + return nil, false, fmt.Errorf("t.lookupEvent: %w", err) + } + h = t.cacheAndReturn(h) + if h.StateKey() != nil { + addedToState := false + for i := range respState.StateEvents { + se := respState.StateEvents[i] + if se.Type() == h.Type() && se.StateKeyEquals(*h.StateKey()) { + respState.StateEvents[i] = h.Unwrap() + addedToState = true + break + } + } + if !addedToState { + respState.StateEvents = append(respState.StateEvents, h.Unwrap()) + } + } + + return respState, false, nil +} + +func (t *missingStateReq) cacheAndReturn(ev *gomatrixserverlib.HeaderedEvent) *gomatrixserverlib.HeaderedEvent { + t.haveEventsMutex.Lock() + defer t.haveEventsMutex.Unlock() + if cached, exists := t.haveEvents[ev.EventID()]; exists { + return cached + } + t.haveEvents[ev.EventID()] = ev + return ev +} + +func (t *missingStateReq) lookupStateAfterEventLocally(ctx context.Context, roomID, eventID string) *gomatrixserverlib.RespState { + var res api.QueryStateAfterEventsResponse + err := t.queryer.QueryStateAfterEvents(ctx, &api.QueryStateAfterEventsRequest{ + RoomID: roomID, + PrevEventIDs: []string{eventID}, + }, &res) + if err != nil || !res.PrevEventsExist { + util.GetLogger(ctx).WithField("room_id", roomID).WithError(err).Warnf("failed to query state after %s locally, prev exists=%v", eventID, res.PrevEventsExist) + return nil + } + stateEvents := make([]*gomatrixserverlib.HeaderedEvent, len(res.StateEvents)) + for i, ev := range res.StateEvents { + // set the event from the haveEvents cache - this means we will share pointers with other prev_event branches for this + // processEvent request, which is better for memory. + stateEvents[i] = t.cacheAndReturn(ev) + t.hadEvent(ev.EventID()) + } + // we should never access res.StateEvents again so we delete it here to make GC faster + res.StateEvents = nil + + var authEvents []*gomatrixserverlib.Event + missingAuthEvents := map[string]bool{} + for _, ev := range stateEvents { + t.haveEventsMutex.Lock() + for _, ae := range ev.AuthEventIDs() { + if aev, ok := t.haveEvents[ae]; ok { + authEvents = append(authEvents, aev.Unwrap()) + } else { + missingAuthEvents[ae] = true + } + } + t.haveEventsMutex.Unlock() + } + // QueryStateAfterEvents does not return the auth events, so fetch them now. We know the roomserver has them else it wouldn't + // have stored the event. + if len(missingAuthEvents) > 0 { + var missingEventList []string + for evID := range missingAuthEvents { + missingEventList = append(missingEventList, evID) + } + queryReq := api.QueryEventsByIDRequest{ + EventIDs: missingEventList, + } + util.GetLogger(ctx).WithField("count", len(missingEventList)).Infof("Fetching missing auth events") + var queryRes api.QueryEventsByIDResponse + if err = t.queryer.QueryEventsByID(ctx, &queryReq, &queryRes); err != nil { + return nil + } + for i, ev := range queryRes.Events { + authEvents = append(authEvents, t.cacheAndReturn(queryRes.Events[i]).Unwrap()) + t.hadEvent(ev.EventID()) + } + queryRes.Events = nil + } + + return &gomatrixserverlib.RespState{ + StateEvents: gomatrixserverlib.UnwrapEventHeaders(stateEvents), + AuthEvents: authEvents, + } +} + +// lookuptStateBeforeEvent returns the room state before the event e, which is just /state_ids and/or /state depending on what +// the server supports. +func (t *missingStateReq) lookupStateBeforeEvent(ctx context.Context, roomVersion gomatrixserverlib.RoomVersion, roomID, eventID string) ( + *gomatrixserverlib.RespState, error) { + + // Attempt to fetch the missing state using /state_ids and /events + return t.lookupMissingStateViaStateIDs(ctx, roomID, eventID, roomVersion) +} + +func (t *missingStateReq) resolveStatesAndCheck(ctx context.Context, roomVersion gomatrixserverlib.RoomVersion, states []*gomatrixserverlib.RespState, backwardsExtremity *gomatrixserverlib.Event) (*gomatrixserverlib.RespState, error) { + var authEventList []*gomatrixserverlib.Event + var stateEventList []*gomatrixserverlib.Event + for _, state := range states { + authEventList = append(authEventList, state.AuthEvents...) + stateEventList = append(stateEventList, state.StateEvents...) + } + resolvedStateEvents, err := gomatrixserverlib.ResolveConflicts(roomVersion, stateEventList, authEventList) + if err != nil { + return nil, err + } + // apply the current event +retryAllowedState: + if err = checkAllowedByState(backwardsExtremity, resolvedStateEvents); err != nil { + switch missing := err.(type) { + case gomatrixserverlib.MissingAuthEventError: + h, err2 := t.lookupEvent(ctx, roomVersion, backwardsExtremity.RoomID(), missing.AuthEventID, true) + switch err2.(type) { + case verifySigError: + return &gomatrixserverlib.RespState{ + AuthEvents: authEventList, + StateEvents: resolvedStateEvents, + }, nil + case nil: + // do nothing + default: + return nil, fmt.Errorf("missing auth event %s and failed to look it up: %w", missing.AuthEventID, err2) + } + util.GetLogger(ctx).Infof("fetched event %s", missing.AuthEventID) + resolvedStateEvents = append(resolvedStateEvents, h.Unwrap()) + goto retryAllowedState + default: + } + return nil, err + } + return &gomatrixserverlib.RespState{ + AuthEvents: authEventList, + StateEvents: resolvedStateEvents, + }, nil +} + +// get missing events for `e`. If `isGapFilled`=true then `newEvents` contains all the events to inject, +// without `e`. If `isGapFilled=false` then `newEvents` contains the response to /get_missing_events +func (t *missingStateReq) getMissingEvents(ctx context.Context, e *gomatrixserverlib.Event, roomVersion gomatrixserverlib.RoomVersion) (newEvents []*gomatrixserverlib.Event, isGapFilled bool, err error) { + logger := util.GetLogger(ctx).WithField("event_id", e.EventID()).WithField("room_id", e.RoomID()) + needed := gomatrixserverlib.StateNeededForAuth([]*gomatrixserverlib.Event{e}) + // query latest events (our trusted forward extremities) + req := api.QueryLatestEventsAndStateRequest{ + RoomID: e.RoomID(), + StateToFetch: needed.Tuples(), + } + var res api.QueryLatestEventsAndStateResponse + if err = t.queryer.QueryLatestEventsAndState(ctx, &req, &res); err != nil { + logger.WithError(err).Warn("Failed to query latest events") + return nil, false, err + } + latestEvents := make([]string, len(res.LatestEvents)) + for i, ev := range res.LatestEvents { + latestEvents[i] = res.LatestEvents[i].EventID + t.hadEvent(ev.EventID) + } + + var missingResp *gomatrixserverlib.RespMissingEvents + for server := range t.servers { + var m gomatrixserverlib.RespMissingEvents + if m, err = t.federation.LookupMissingEvents(ctx, server, e.RoomID(), gomatrixserverlib.MissingEvents{ + Limit: 20, + // The latest event IDs that the sender already has. These are skipped when retrieving the previous events of latest_events. + EarliestEvents: latestEvents, + // The event IDs to retrieve the previous events for. + LatestEvents: []string{e.EventID()}, + }, roomVersion); err == nil { + missingResp = &m + break + } else { + logger.WithError(err).Errorf("%s pushed us an event but %q did not respond to /get_missing_events", t.origin, server) + if errors.Is(err, context.DeadlineExceeded) { + select { + case <-ctx.Done(): // the parent request context timed out + return nil, false, context.DeadlineExceeded + default: // this request exceed its own timeout + continue + } + } + } + } + + if missingResp == nil { + logger.WithError(err).Errorf( + "%s pushed us an event but %d server(s) couldn't give us details about prev_events via /get_missing_events - dropping this event until it can", + t.origin, len(t.servers), + ) + return nil, false, missingPrevEventsError{ + eventID: e.EventID(), + err: err, + } + } + + // Make sure events from the missingResp are using the cache - missing events + // will be added and duplicates will be removed. + logger.Infof("get_missing_events returned %d events", len(missingResp.Events)) + for i, ev := range missingResp.Events { + missingResp.Events[i] = t.cacheAndReturn(ev.Headered(roomVersion)).Unwrap() + } + + // topologically sort and sanity check that we are making forward progress + newEvents = gomatrixserverlib.ReverseTopologicalOrdering(missingResp.Events, gomatrixserverlib.TopologicalOrderByPrevEvents) + shouldHaveSomeEventIDs := e.PrevEventIDs() + hasPrevEvent := false +Event: + for _, pe := range shouldHaveSomeEventIDs { + for _, ev := range newEvents { + if ev.EventID() == pe { + hasPrevEvent = true + break Event + } + } + } + if !hasPrevEvent { + err = fmt.Errorf("called /get_missing_events but server %s didn't return any prev_events with IDs %v", t.origin, shouldHaveSomeEventIDs) + logger.WithError(err).Errorf( + "%s pushed us an event but couldn't give us details about prev_events via /get_missing_events - dropping this event until it can", + t.origin, + ) + return nil, false, missingPrevEventsError{ + eventID: e.EventID(), + err: err, + } + } + if len(newEvents) == 0 { + return nil, false, nil // TODO: error instead? + } + + // now check if we can fill the gap. Look to see if we have state snapshot IDs for the earliest event + earliestNewEvent := newEvents[0] + if state, err := t.db.StateAtEventIDs(ctx, []string{earliestNewEvent.EventID()}); err != nil || len(state) == 0 { + if earliestNewEvent.Type() == gomatrixserverlib.MRoomCreate && earliestNewEvent.StateKeyEquals("") { + // we got to the beginning of the room so there will be no state! It's all good we can process this + return newEvents, true, nil + } + // we don't have the state at this earliest event from /g_m_e so we won't have state for later events either + return newEvents, false, nil + } + // StateAtEventIDs returned some kind of state for the earliest event so we can fill in the gap! + return newEvents, true, nil +} + +func (t *missingStateReq) lookupMissingStateViaState(ctx context.Context, roomID, eventID string, roomVersion gomatrixserverlib.RoomVersion) ( + respState *gomatrixserverlib.RespState, err error) { + state, err := t.federation.LookupState(ctx, t.origin, roomID, eventID, roomVersion) + if err != nil { + return nil, err + } + // Check that the returned state is valid. + if err := state.Check(ctx, t.keys, nil); err != nil { + return nil, err + } + // Cache the results of this state lookup and deduplicate anything we already + // have in the cache, freeing up memory. + for i, ev := range state.AuthEvents { + state.AuthEvents[i] = t.cacheAndReturn(ev.Headered(roomVersion)).Unwrap() + } + for i, ev := range state.StateEvents { + state.StateEvents[i] = t.cacheAndReturn(ev.Headered(roomVersion)).Unwrap() + } + return &state, nil +} + +func (t *missingStateReq) lookupMissingStateViaStateIDs(ctx context.Context, roomID, eventID string, roomVersion gomatrixserverlib.RoomVersion) ( + *gomatrixserverlib.RespState, error) { + util.GetLogger(ctx).WithField("room_id", roomID).Infof("lookupMissingStateViaStateIDs %s", eventID) + // fetch the state event IDs at the time of the event + stateIDs, err := t.federation.LookupStateIDs(ctx, t.origin, roomID, eventID) + if err != nil { + return nil, err + } + // work out which auth/state IDs are missing + wantIDs := append(stateIDs.StateEventIDs, stateIDs.AuthEventIDs...) + missing := make(map[string]bool) + var missingEventList []string + t.haveEventsMutex.Lock() + for _, sid := range wantIDs { + if _, ok := t.haveEvents[sid]; !ok { + if !missing[sid] { + missing[sid] = true + missingEventList = append(missingEventList, sid) + } + } + } + t.haveEventsMutex.Unlock() + + // fetch as many as we can from the roomserver + queryReq := api.QueryEventsByIDRequest{ + EventIDs: missingEventList, + } + var queryRes api.QueryEventsByIDResponse + if err = t.queryer.QueryEventsByID(ctx, &queryReq, &queryRes); err != nil { + return nil, err + } + for i, ev := range queryRes.Events { + queryRes.Events[i] = t.cacheAndReturn(queryRes.Events[i]) + t.hadEvent(ev.EventID()) + evID := queryRes.Events[i].EventID() + if missing[evID] { + delete(missing, evID) + } + } + queryRes.Events = nil // allow it to be GCed + + concurrentRequests := 8 + missingCount := len(missing) + util.GetLogger(ctx).WithField("room_id", roomID).WithField("event_id", eventID).Infof("lookupMissingStateViaStateIDs missing %d/%d events", missingCount, len(wantIDs)) + + // If over 50% of the auth/state events from /state_ids are missing + // then we'll just call /state instead, otherwise we'll just end up + // hammering the remote side with /event requests unnecessarily. + if missingCount > concurrentRequests && missingCount > len(wantIDs)/2 { + util.GetLogger(ctx).WithFields(logrus.Fields{ + "missing": missingCount, + "event_id": eventID, + "room_id": roomID, + "total_state": len(stateIDs.StateEventIDs), + "total_auth_events": len(stateIDs.AuthEventIDs), + }).Info("Fetching all state at event") + return t.lookupMissingStateViaState(ctx, roomID, eventID, roomVersion) + } + + if missingCount > 0 { + util.GetLogger(ctx).WithFields(logrus.Fields{ + "missing": missingCount, + "event_id": eventID, + "room_id": roomID, + "total_state": len(stateIDs.StateEventIDs), + "total_auth_events": len(stateIDs.AuthEventIDs), + "concurrent_requests": concurrentRequests, + }).Info("Fetching missing state at event") + + // Create a queue containing all of the missing event IDs that we want + // to retrieve. + pending := make(chan string, missingCount) + for missingEventID := range missing { + pending <- missingEventID + } + close(pending) + + // Define how many workers we should start to do this. + if missingCount < concurrentRequests { + concurrentRequests = missingCount + } + + // Create the wait group. + var fetchgroup sync.WaitGroup + fetchgroup.Add(concurrentRequests) + + // This is the only place where we'll write to t.haveEvents from + // multiple goroutines, and everywhere else is blocked on this + // synchronous function anyway. + var haveEventsMutex sync.Mutex + + // Define what we'll do in order to fetch the missing event ID. + fetch := func(missingEventID string) { + var h *gomatrixserverlib.HeaderedEvent + h, err = t.lookupEvent(ctx, roomVersion, roomID, missingEventID, false) + switch err.(type) { + case verifySigError: + return + case nil: + break + default: + util.GetLogger(ctx).WithFields(logrus.Fields{ + "event_id": missingEventID, + "room_id": roomID, + }).Info("Failed to fetch missing event") + return + } + haveEventsMutex.Lock() + t.cacheAndReturn(h) + haveEventsMutex.Unlock() + } + + // Create the worker. + worker := func(ch <-chan string) { + defer fetchgroup.Done() + for missingEventID := range ch { + fetch(missingEventID) + } + } + + // Start the workers. + for i := 0; i < concurrentRequests; i++ { + go worker(pending) + } + + // Wait for the workers to finish. + fetchgroup.Wait() + } + + resp, err := t.createRespStateFromStateIDs(stateIDs) + return resp, err +} + +func (t *missingStateReq) createRespStateFromStateIDs(stateIDs gomatrixserverlib.RespStateIDs) ( + *gomatrixserverlib.RespState, error) { // nolint:unparam + t.haveEventsMutex.Lock() + defer t.haveEventsMutex.Unlock() + + // create a RespState response using the response to /state_ids as a guide + respState := gomatrixserverlib.RespState{} + + for i := range stateIDs.StateEventIDs { + ev, ok := t.haveEvents[stateIDs.StateEventIDs[i]] + if !ok { + logrus.Warnf("Missing state event in createRespStateFromStateIDs: %s", stateIDs.StateEventIDs[i]) + continue + } + respState.StateEvents = append(respState.StateEvents, ev.Unwrap()) + } + for i := range stateIDs.AuthEventIDs { + ev, ok := t.haveEvents[stateIDs.AuthEventIDs[i]] + if !ok { + logrus.Warnf("Missing auth event in createRespStateFromStateIDs: %s", stateIDs.AuthEventIDs[i]) + continue + } + respState.AuthEvents = append(respState.AuthEvents, ev.Unwrap()) + } + // We purposefully do not do auth checks on the returned events, as they will still + // be processed in the exact same way, just as a 'rejected' event + // TODO: Add a field to HeaderedEvent to indicate if the event is rejected. + return &respState, nil +} + +func (t *missingStateReq) lookupEvent(ctx context.Context, roomVersion gomatrixserverlib.RoomVersion, _, missingEventID string, localFirst bool) (*gomatrixserverlib.HeaderedEvent, error) { + if localFirst { + // fetch from the roomserver + queryReq := api.QueryEventsByIDRequest{ + EventIDs: []string{missingEventID}, + } + var queryRes api.QueryEventsByIDResponse + if err := t.queryer.QueryEventsByID(ctx, &queryReq, &queryRes); err != nil { + util.GetLogger(ctx).Warnf("Failed to query roomserver for missing event %s: %s - falling back to remote", missingEventID, err) + } else if len(queryRes.Events) == 1 { + return queryRes.Events[0], nil + } + } + var event *gomatrixserverlib.Event + found := false + for serverName := range t.servers { + reqctx, cancel := context.WithTimeout(ctx, time.Second*30) + defer cancel() + txn, err := t.federation.GetEvent(reqctx, serverName, missingEventID) + if err != nil || len(txn.PDUs) == 0 { + util.GetLogger(ctx).WithError(err).WithField("event_id", missingEventID).Warn("Failed to get missing /event for event ID") + if errors.Is(err, context.DeadlineExceeded) { + select { + case <-reqctx.Done(): // this server took too long + continue + case <-ctx.Done(): // the input request timed out + return nil, context.DeadlineExceeded + } + } + continue + } + event, err = gomatrixserverlib.NewEventFromUntrustedJSON(txn.PDUs[0], roomVersion) + if err != nil { + util.GetLogger(ctx).WithError(err).WithField("event_id", missingEventID).Warnf("Transaction: Failed to parse event JSON of event") + continue + } + found = true + break + } + if !found { + util.GetLogger(ctx).WithField("event_id", missingEventID).Warnf("Failed to get missing /event for event ID from %d server(s)", len(t.servers)) + return nil, fmt.Errorf("wasn't able to find event via %d server(s)", len(t.servers)) + } + if err := event.VerifyEventSignatures(ctx, t.keys); err != nil { + util.GetLogger(ctx).WithError(err).Warnf("Transaction: Couldn't validate signature of event %q", event.EventID()) + return nil, verifySigError{event.EventID(), err} + } + return t.cacheAndReturn(event.Headered(roomVersion)), nil +} + +func checkAllowedByState(e *gomatrixserverlib.Event, stateEvents []*gomatrixserverlib.Event) error { + authUsingState := gomatrixserverlib.NewAuthEvents(nil) + for i := range stateEvents { + err := authUsingState.AddEvent(stateEvents[i]) + if err != nil { + return err + } + } + return gomatrixserverlib.Allowed(e, &authUsingState) +} + +func (t *missingStateReq) hadEvent(eventID string) { + t.hadEventsMutex.Lock() + defer t.hadEventsMutex.Unlock() + t.hadEvents[eventID] = true +} + +type verifySigError struct { + eventID string + err error +} +type missingPrevEventsError struct { + eventID string + err error +} + +func (e verifySigError) Error() string { + return fmt.Sprintf("unable to verify signature of event %q: %s", e.eventID, e.err) +} +func (e missingPrevEventsError) Error() string { + return fmt.Sprintf("unable to get prev_events for event %q: %s", e.eventID, e.err) +} diff --git a/roomserver/internal/perform/perform_backfill.go b/roomserver/internal/perform/perform_backfill.go index e198f67d..f3623de8 100644 --- a/roomserver/internal/perform/perform_backfill.go +++ b/roomserver/internal/perform/perform_backfill.go @@ -77,7 +77,7 @@ func (r *Backfiller) PerformBackfill( } // Scan the event tree for events to send back. - resultNIDs, err := helpers.ScanEventTree(ctx, r.DB, *info, front, visited, request.Limit, request.ServerName) + resultNIDs, err := helpers.ScanEventTree(ctx, r.DB, info, front, visited, request.Limit, request.ServerName) if err != nil { return err } @@ -418,7 +418,7 @@ FindSuccessor: return nil } - stateEntries, err := helpers.StateBeforeEvent(ctx, b.db, *info, NIDs[eventID]) + stateEntries, err := helpers.StateBeforeEvent(ctx, b.db, info, NIDs[eventID]) if err != nil { logrus.WithField("event_id", eventID).WithError(err).Error("ServersAtEvent: failed to load state before event") return nil diff --git a/roomserver/internal/perform/perform_inbound_peek.go b/roomserver/internal/perform/perform_inbound_peek.go index 98f5f6f9..d19fc838 100644 --- a/roomserver/internal/perform/perform_inbound_peek.go +++ b/roomserver/internal/perform/perform_inbound_peek.go @@ -79,7 +79,7 @@ func (r *InboundPeeker) PerformInboundPeek( response.LatestEvent = sortedLatestEvents[0].Headered(info.RoomVersion) // XXX: do we actually need to do a state resolution here? - roomState := state.NewStateResolution(r.DB, *info) + roomState := state.NewStateResolution(r.DB, info) var stateEntries []types.StateEntry stateEntries, err = roomState.LoadStateAtSnapshot( diff --git a/roomserver/internal/perform/perform_invite.go b/roomserver/internal/perform/perform_invite.go index ca065468..85b2322f 100644 --- a/roomserver/internal/perform/perform_invite.go +++ b/roomserver/internal/perform/perform_invite.go @@ -172,7 +172,7 @@ func (r *Inviter) PerformInvite( { Kind: api.KindNew, Event: event, - AuthEventIDs: event.AuthEventIDs(), + Origin: event.Origin(), SendAsServer: req.SendAsServer, }, }, @@ -231,7 +231,7 @@ func buildInviteStrippedState( StateKey: "", }) } - roomState := state.NewStateResolution(db, *info) + roomState := state.NewStateResolution(db, info) stateEntries, err := roomState.LoadStateAtSnapshotForStringTuples( ctx, info.StateSnapshotNID, stateWanted, ) diff --git a/roomserver/internal/perform/perform_join.go b/roomserver/internal/perform/perform_join.go index 75397eb6..a1ffab5d 100644 --- a/roomserver/internal/perform/perform_join.go +++ b/roomserver/internal/perform/perform_join.go @@ -271,7 +271,6 @@ func (r *Joiner) performJoinRoomByID( { Kind: rsAPI.KindNew, Event: event.Headered(buildRes.RoomVersion), - AuthEventIDs: event.AuthEventIDs(), SendAsServer: string(r.Cfg.Matrix.ServerName), }, }, diff --git a/roomserver/internal/perform/perform_leave.go b/roomserver/internal/perform/perform_leave.go index 4daeb10a..eac528ea 100644 --- a/roomserver/internal/perform/perform_leave.go +++ b/roomserver/internal/perform/perform_leave.go @@ -139,7 +139,7 @@ func (r *Leaver) performLeaveRoomByID( { Kind: api.KindNew, Event: event.Headered(buildRes.RoomVersion), - AuthEventIDs: event.AuthEventIDs(), + Origin: event.Origin(), SendAsServer: string(r.Cfg.Matrix.ServerName), }, }, diff --git a/roomserver/internal/query/query.go b/roomserver/internal/query/query.go index 28b140c7..6b4cb581 100644 --- a/roomserver/internal/query/query.go +++ b/roomserver/internal/query/query.go @@ -63,7 +63,7 @@ func (r *Queryer) QueryStateAfterEvents( return nil } - roomState := state.NewStateResolution(r.DB, *info) + roomState := state.NewStateResolution(r.DB, info) response.RoomExists = true response.RoomVersion = info.RoomVersion @@ -294,7 +294,7 @@ func (r *Queryer) QueryMembershipsForRoom( events, err = r.DB.Events(ctx, eventNIDs) } else { - stateEntries, err = helpers.StateBeforeEvent(ctx, r.DB, *info, membershipEventNID) + stateEntries, err = helpers.StateBeforeEvent(ctx, r.DB, info, membershipEventNID) if err != nil { logrus.WithField("membership_event_nid", membershipEventNID).WithError(err).Error("failed to load state before event") return err @@ -377,7 +377,7 @@ func (r *Queryer) QueryServerAllowedToSeeEvent( return fmt.Errorf("QueryServerAllowedToSeeEvent: no room info for room %s", roomID) } response.AllowedToSeeEvent, err = helpers.CheckServerAllowedToSeeEvent( - ctx, r.DB, *info, request.EventID, request.ServerName, inRoomRes.IsInRoom, + ctx, r.DB, info, request.EventID, request.ServerName, inRoomRes.IsInRoom, ) return } @@ -416,7 +416,7 @@ func (r *Queryer) QueryMissingEvents( return fmt.Errorf("missing RoomInfo for room %s", events[0].RoomID()) } - resultNIDs, err := helpers.ScanEventTree(ctx, r.DB, *info, front, visited, request.Limit, request.ServerName) + resultNIDs, err := helpers.ScanEventTree(ctx, r.DB, info, front, visited, request.Limit, request.ServerName) if err != nil { return err } @@ -473,7 +473,7 @@ func (r *Queryer) QueryStateAndAuthChain( } var stateEvents []*gomatrixserverlib.Event - stateEvents, err = r.loadStateAtEventIDs(ctx, *info, request.PrevEventIDs) + stateEvents, err = r.loadStateAtEventIDs(ctx, info, request.PrevEventIDs) if err != nil { return err } @@ -512,7 +512,7 @@ func (r *Queryer) QueryStateAndAuthChain( return err } -func (r *Queryer) loadStateAtEventIDs(ctx context.Context, roomInfo types.RoomInfo, eventIDs []string) ([]*gomatrixserverlib.Event, error) { +func (r *Queryer) loadStateAtEventIDs(ctx context.Context, roomInfo *types.RoomInfo, eventIDs []string) ([]*gomatrixserverlib.Event, error) { roomState := state.NewStateResolution(r.DB, roomInfo) prevStates, err := r.DB.StateAtEventIDs(ctx, eventIDs) if err != nil { diff --git a/roomserver/state/state.go b/roomserver/state/state.go index 78398fc7..15d592b4 100644 --- a/roomserver/state/state.go +++ b/roomserver/state/state.go @@ -32,11 +32,11 @@ import ( type StateResolution struct { db storage.Database - roomInfo types.RoomInfo + roomInfo *types.RoomInfo events map[types.EventNID]*gomatrixserverlib.Event } -func NewStateResolution(db storage.Database, roomInfo types.RoomInfo) StateResolution { +func NewStateResolution(db storage.Database, roomInfo *types.RoomInfo) StateResolution { return StateResolution{ db: db, roomInfo: roomInfo, diff --git a/roomserver/storage/postgres/events_table.go b/roomserver/storage/postgres/events_table.go index c549fb65..778cd8d7 100644 --- a/roomserver/storage/postgres/events_table.go +++ b/roomserver/storage/postgres/events_table.go @@ -311,7 +311,9 @@ func (s *eventStatements) BulkSelectStateAtEventByID( ); err != nil { return nil, err } - if result.BeforeStateSnapshotNID == 0 { + // Genuine create events are the only case where it's OK to have no previous state. + isCreate := result.EventTypeNID == types.MRoomCreateNID && result.EventStateKeyNID == 1 + if result.BeforeStateSnapshotNID == 0 && !isCreate { return nil, types.MissingEventError( fmt.Sprintf("storage: missing state for event NID %d", result.EventNID), ) diff --git a/roomserver/storage/sqlite3/events_table.go b/roomserver/storage/sqlite3/events_table.go index 3127eb17..7483e281 100644 --- a/roomserver/storage/sqlite3/events_table.go +++ b/roomserver/storage/sqlite3/events_table.go @@ -322,7 +322,9 @@ func (s *eventStatements) BulkSelectStateAtEventByID( ); err != nil { return nil, err } - if result.BeforeStateSnapshotNID == 0 { + // Genuine create events are the only case where it's OK to have no previous state. + isCreate := result.EventTypeNID == types.MRoomCreateNID && result.EventStateKeyNID == 1 + if result.BeforeStateSnapshotNID == 0 && !isCreate { return nil, types.MissingEventError( fmt.Sprintf("storage: missing state for event NID %d", result.EventNID), ) |