diff options
author | Till <2353100+S7evinK@users.noreply.github.com> | 2022-09-07 11:44:27 +0200 |
---|---|---|
committer | GitHub <noreply@github.com> | 2022-09-07 11:44:27 +0200 |
commit | 440eb0f3a2e5aecddebe3e093d5740ce5c4505c2 (patch) | |
tree | 7d141e3fdbc7c53f314110ed00b6b6d39eeb7a98 /keyserver/internal | |
parent | 847032df36086ff6f383bcc4c332c391e86eaec8 (diff) |
Handle errors differently in the `DeviceListUpdater` (#2695)
The test `If a device list update goes missing, the server resyncs on the
next one` was failing because a previous test would receive a `waitTime`
of 1h, resulting in the test timing out.
This change handles the returned errors differently, e.g. by using
the default `waitTime` of 2s. It also stops trying further users in the
list if one of the errors would cause a longer `waitTime`.
Diffstat (limited to 'keyserver/internal')
-rw-r--r-- | keyserver/internal/device_list_update.go | 48 |
1 files changed, 36 insertions, 12 deletions
```diff
diff --git a/keyserver/internal/device_list_update.go b/keyserver/internal/device_list_update.go
index 304b67b2..8530f907 100644
--- a/keyserver/internal/device_list_update.go
+++ b/keyserver/internal/device_list_update.go
@@ -19,9 +19,11 @@ import (
 	"encoding/json"
 	"fmt"
 	"hash/fnv"
+	"net"
 	"sync"
 	"time"
 
+	"github.com/matrix-org/gomatrix"
 	"github.com/matrix-org/gomatrixserverlib"
 	"github.com/matrix-org/util"
 	"github.com/prometheus/client_golang/prometheus"
@@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
 		return waitTime, true
 	}
 	failCount := 0
+
+userLoop:
 	for _, userID := range userIDs {
 		if ctx.Err() != nil {
 			// we've timed out, give up and go to the back of the queue to let another server be processed.
@@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
 		res, err := u.fedClient.GetUserDevices(ctx, serverName, userID)
 		if err != nil {
 			failCount += 1
-			fcerr, ok := err.(*fedsenderapi.FederationClientError)
-			if ok {
-				if fcerr.RetryAfter > 0 {
-					waitTime = fcerr.RetryAfter
-				} else if fcerr.Blacklisted {
+			switch e := err.(type) {
+			case *fedsenderapi.FederationClientError:
+				if e.RetryAfter > 0 {
+					waitTime = e.RetryAfter
+					break userLoop
+				} else if e.Blacklisted {
 					waitTime = time.Hour * 8
-				} else {
-					// For all other errors (DNS resolution, network etc.) wait 1 hour.
-					waitTime = time.Hour
+					break userLoop
+				}
+			case net.Error:
+				// Use the default waitTime, if it's a timeout.
+				// It probably doesn't make sense to try further users.
+				if !e.Timeout() {
+					waitTime = time.Minute * 10
+					logrus.WithError(e).Error("GetUserDevices returned net.Error")
+					break userLoop
+				}
+			case gomatrix.HTTPError:
+				// The remote server returned an error, give it some time to recover
+				if e.Code >= 500 {
+					waitTime = time.Minute * 10
+					logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError")
+					break userLoop
 				}
-			} else {
-				waitTime = time.Hour
-				logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type")
+			default:
+				// Something else failed
+				waitTime = time.Minute * 10
+				logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err)
+				break userLoop
 			}
 			continue
 		}
@@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
 		}
 	}
 	if failCount > 0 {
-		logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users")
+		logger.WithFields(logrus.Fields{
+			"total":   len(userIDs),
+			"failed":  failCount,
+			"skipped": len(userIDs) - failCount,
+		}).Warn("Failed to query device keys for some users")
 	}
 	for _, userID := range userIDs {
 		// always clear the channel to unblock Update calls regardless of success/failure
```