aboutsummaryrefslogtreecommitdiff
path: root/keyserver/internal
diff options
context:
space:
mode:
authorTill <2353100+S7evinK@users.noreply.github.com>2022-09-07 11:44:27 +0200
committerGitHub <noreply@github.com>2022-09-07 11:44:27 +0200
commit440eb0f3a2e5aecddebe3e093d5740ce5c4505c2 (patch)
tree7d141e3fdbc7c53f314110ed00b6b6d39eeb7a98 /keyserver/internal
parent847032df36086ff6f383bcc4c332c391e86eaec8 (diff)
Handle errors differently in the `DeviceListUpdater` (#2695)
The test `If a device list update goes missing, the server resyncs on the next one` was failing because a previous test would receive a `waitTime` of 1h, causing the test to time out. This change handles the returned errors differently, e.g. by using the default `waitTime` of 2s. It also stops trying further users in the list if one of the errors would cause a longer `waitTime`.
Diffstat (limited to 'keyserver/internal')
-rw-r--r--keyserver/internal/device_list_update.go48
1 files changed, 36 insertions, 12 deletions
diff --git a/keyserver/internal/device_list_update.go b/keyserver/internal/device_list_update.go
index 304b67b2..8530f907 100644
--- a/keyserver/internal/device_list_update.go
+++ b/keyserver/internal/device_list_update.go
@@ -19,9 +19,11 @@ import (
"encoding/json"
"fmt"
"hash/fnv"
+ "net"
"sync"
"time"
+ "github.com/matrix-org/gomatrix"
"github.com/matrix-org/gomatrixserverlib"
"github.com/matrix-org/util"
"github.com/prometheus/client_golang/prometheus"
@@ -388,6 +390,8 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
return waitTime, true
}
failCount := 0
+
+userLoop:
for _, userID := range userIDs {
if ctx.Err() != nil {
// we've timed out, give up and go to the back of the queue to let another server be processed.
@@ -397,19 +401,35 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
res, err := u.fedClient.GetUserDevices(ctx, serverName, userID)
if err != nil {
failCount += 1
- fcerr, ok := err.(*fedsenderapi.FederationClientError)
- if ok {
- if fcerr.RetryAfter > 0 {
- waitTime = fcerr.RetryAfter
- } else if fcerr.Blacklisted {
+ switch e := err.(type) {
+ case *fedsenderapi.FederationClientError:
+ if e.RetryAfter > 0 {
+ waitTime = e.RetryAfter
+ break userLoop
+ } else if e.Blacklisted {
waitTime = time.Hour * 8
- } else {
- // For all other errors (DNS resolution, network etc.) wait 1 hour.
- waitTime = time.Hour
+ break userLoop
+ }
+ case net.Error:
+ // Use the default waitTime, if it's a timeout.
+ // It probably doesn't make sense to try further users.
+ if !e.Timeout() {
+ waitTime = time.Minute * 10
+ logrus.WithError(e).Error("GetUserDevices returned net.Error")
+ break userLoop
+ }
+ case gomatrix.HTTPError:
+ // The remote server returned an error, give it some time to recover
+ if e.Code >= 500 {
+ waitTime = time.Minute * 10
+ logrus.WithError(e).Error("GetUserDevices returned gomatrix.HTTPError")
+ break userLoop
}
- } else {
- waitTime = time.Hour
- logger.WithError(err).WithField("user_id", userID).Debug("GetUserDevices returned unknown error type")
+ default:
+ // Something else failed
+ waitTime = time.Minute * 10
+ logger.WithError(err).WithField("user_id", userID).Debugf("GetUserDevices returned unknown error type: %T", err)
+ break userLoop
}
continue
}
@@ -437,7 +457,11 @@ func (u *DeviceListUpdater) processServer(serverName gomatrixserverlib.ServerNam
}
}
if failCount > 0 {
- logger.WithField("total", len(userIDs)).WithField("failed", failCount).WithField("wait", waitTime).Warn("Failed to query device keys for some users")
+ logger.WithFields(logrus.Fields{
+ "total": len(userIDs),
+ "failed": failCount,
+ "skipped": len(userIDs) - failCount,
+ }).Warn("Failed to query device keys for some users")
}
for _, userID := range userIDs {
// always clear the channel to unblock Update calls regardless of success/failure