aboutsummaryrefslogtreecommitdiff
path: root/setup
diff options
context:
space:
mode:
author Neil Alexander <neilalexander@users.noreply.github.com> 2022-10-04 13:02:41 +0100
committer Neil Alexander <neilalexander@users.noreply.github.com> 2022-10-04 13:02:41 +0100
commit 3da182212e86daca1d7019efbd424b2bd38b8b3c (patch)
tree 3d3e4b7d52e03f69aa352aa57f04f904160c71bb /setup
parent a767102f8a74031b5d0612d985c7589dfe0e0821 (diff)
Track reasons why the process is in a degraded state
Diffstat (limited to 'setup')
-rw-r--r-- setup/base/base.go 8
-rw-r--r-- setup/jetstream/nats.go 6
-rw-r--r-- setup/process/process.go 35
3 files changed, 33 insertions, 16 deletions
diff --git a/setup/base/base.go b/setup/base/base.go
index 0636c7b8..2e3a3a19 100644
--- a/setup/base/base.go
+++ b/setup/base/base.go
@@ -18,6 +18,7 @@ import (
"context"
"crypto/tls"
"database/sql"
+ "encoding/json"
"fmt"
"io"
"net"
@@ -467,8 +468,13 @@ func (b *BaseDendrite) SetupAndServeHTTP(
w.WriteHeader(200)
})
b.DendriteAdminMux.HandleFunc("/monitor/health", func(w http.ResponseWriter, r *http.Request) {
- if b.ProcessContext.IsDegraded() {
+ if isDegraded, reasons := b.ProcessContext.IsDegraded(); isDegraded {
w.WriteHeader(503)
+ _ = json.NewEncoder(w).Encode(struct {
+ Warnings []string `json:"warnings"`
+ }{
+ Warnings: reasons,
+ })
return
}
w.WriteHeader(200)
diff --git a/setup/jetstream/nats.go b/setup/jetstream/nats.go
index 7409fd6c..af4eb294 100644
--- a/setup/jetstream/nats.go
+++ b/setup/jetstream/nats.go
@@ -169,9 +169,9 @@ func setupNATS(process *process.ProcessContext, cfg *config.JetStream, nc *natsc
// We've managed to add the stream in memory. What's on the
// disk will be left alone, but our ability to recover from a
// future crash will be limited. Yell about it.
- sentry.CaptureException(fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible", namespaced.Name))
- logrus.Warn("Stream is running in-memory; this may be due to data corruption in the JetStream storage directory, investigate as soon as possible")
- process.Degraded()
+ err := fmt.Errorf("Stream %q is running in-memory; this may be due to data corruption in the JetStream storage directory", namespaced.Name)
+ sentry.CaptureException(err)
+ process.Degraded(err)
}
}
}
diff --git a/setup/process/process.go b/setup/process/process.go
index 06ef6021..b2d2844a 100644
--- a/setup/process/process.go
+++ b/setup/process/process.go
@@ -2,19 +2,18 @@ package process
import (
"context"
- "fmt"
"sync"
"github.com/getsentry/sentry-go"
"github.com/sirupsen/logrus"
- "go.uber.org/atomic"
)
// ProcessContext holds process-wide state: a wait group for components, a
// cancellable context together with its cancel function, and the recorded
// reasons for which the process is considered degraded.
type ProcessContext struct {
- wg *sync.WaitGroup // used to wait for components to shutdown
- ctx context.Context // cancelled when Stop is called
- shutdown context.CancelFunc // shut down Dendrite
- degraded atomic.Bool
+ mu sync.RWMutex // guards degraded
+ wg *sync.WaitGroup // used to wait for components to shutdown
+ ctx context.Context // cancelled when Stop is called
+ shutdown context.CancelFunc // shut down Dendrite
+ degraded map[string]struct{} // reasons why the process is degraded, keyed by error message
}
func NewProcessContext() *ProcessContext {
@@ -50,13 +49,25 @@ func (b *ProcessContext) WaitForComponentsToFinish() {
b.wg.Wait()
}
-func (b *ProcessContext) Degraded() {
- if b.degraded.CompareAndSwap(false, true) {
- logrus.Warn("Dendrite is running in a degraded state")
- sentry.CaptureException(fmt.Errorf("Process is running in a degraded state"))
+func (b *ProcessContext) Degraded(err error) {
+ b.mu.Lock()
+ defer b.mu.Unlock()
+ if _, ok := b.degraded[err.Error()]; !ok {
+ logrus.WithError(err).Warn("Dendrite has entered a degraded state")
+ sentry.CaptureException(err)
+ b.degraded[err.Error()] = struct{}{}
}
}
-func (b *ProcessContext) IsDegraded() bool {
- return b.degraded.Load()
+func (b *ProcessContext) IsDegraded() (bool, []string) {
+ b.mu.RLock()
+ defer b.mu.RUnlock()
+ if len(b.degraded) == 0 {
+ return false, nil
+ }
+ reasons := make([]string, 0, len(b.degraded))
+ for reason := range b.degraded {
+ reasons = append(reasons, reason)
+ }
+ return true, reasons
}