• Home
  • Features
  • Pricing
  • Docs
  • Announcements
  • Sign In

lightningnetwork / lnd / 10203737448

01 Aug 2024 06:26PM UTC coverage: 58.674% (+0.05%) from 58.627%
10203737448

push

github

web-flow
Merge pull request #8938 from bhandras/etcd-leader-election-fixups

multi: check leader status with our health checker to correctly shut down LND if network partitions

28 of 73 new or added lines in 6 files covered. (38.36%)

117 existing lines in 18 files now uncovered.

125392 of 213710 relevant lines covered (58.67%)

28078.2 hits per line

Source File
Press 'n' to go to next uncovered line, 'b' for previous

67.27
/healthcheck/healthcheck.go
1
// Package healthcheck contains a monitor which takes a set of liveness checks
2
// which it periodically checks. If a check fails after its configured number
3
// of allowed call attempts, the monitor will send a request to shutdown using
4
// the function it is provided in its config. Checks are dispatched in their own
5
// goroutines so that they do not block each other.
6
package healthcheck
7

8
import (
9
        "errors"
10
        "fmt"
11
        "sync"
12
        "sync/atomic"
13
        "time"
14

15
        "github.com/lightningnetwork/lnd/ticker"
16
)
17

18
// noOpCallback is an empty callback. NewObservation installs it as the default
// for OnSuccess and OnFailure when no callback option is supplied.
var noOpCallback = func() {}
9✔
19

20
// Config contains configuration settings for our monitor.
21
type Config struct {
22
        // Checks is a set of health checks that assert that lnd has access to
23
        // critical resources.
24
        Checks []*Observation
25

26
        // Shutdown should be called to request safe shutdown on failure of a
27
        // health check.
28
        Shutdown shutdownFunc
29
}
30

31
// shutdownFunc is the signature we use for a shutdown function which allows us
// to print our reason for shutdown. It takes a printf-style format string and
// its parameters.
type shutdownFunc func(format string, params ...interface{})
34

35
// Monitor periodically checks a series of configured liveness checks to
36
// ensure that lnd has access to all critical resources.
37
type Monitor struct {
38
        started int32 // To be used atomically.
39
        stopped int32 // To be used atomically.
40

41
        cfg *Config
42

43
        quit chan struct{}
44
        wg   sync.WaitGroup
45
}
46

47
// NewMonitor returns a monitor with the provided config.
48
func NewMonitor(cfg *Config) *Monitor {
3✔
49
        return &Monitor{
3✔
50
                cfg:  cfg,
3✔
51
                quit: make(chan struct{}),
3✔
52
        }
3✔
53
}
3✔
54

55
// Start launches the goroutines required to run our monitor.
56
func (m *Monitor) Start() error {
3✔
57
        if !atomic.CompareAndSwapInt32(&m.started, 0, 1) {
3✔
58
                return errors.New("monitor already started")
×
59
        }
×
60

61
        // Run through all of the health checks that we have configured and
62
        // start a goroutine for each check.
63
        for _, check := range m.cfg.Checks {
5✔
64
                check := check
2✔
65

2✔
66
                // Skip over health checks that are disabled by setting zero
2✔
67
                // attempts.
2✔
68
                if check.Attempts == 0 {
2✔
69
                        log.Warnf("check: %v configured with 0 attempts, "+
×
70
                                "skipping it", check.Name)
×
71

×
72
                        continue
×
73
                }
74

75
                m.wg.Add(1)
2✔
76
                go func(check *Observation) {
4✔
77
                        defer m.wg.Done()
2✔
78

2✔
79
                        check.monitor(m.cfg.Shutdown, m.quit)
2✔
80
                }(check)
2✔
81
        }
82

83
        return nil
3✔
84
}
85

86
// Stop sends all goroutines the signal to exit and waits for them to exit.
87
func (m *Monitor) Stop() error {
3✔
88
        if !atomic.CompareAndSwapInt32(&m.stopped, 0, 1) {
3✔
89
                return fmt.Errorf("monitor already stopped")
×
90
        }
×
91

92
        log.Info("Health monitor shutting down...")
3✔
93
        defer log.Debug("Health monitor shutdown complete")
3✔
94

3✔
95
        close(m.quit)
3✔
96
        m.wg.Wait()
3✔
97

3✔
98
        return nil
3✔
99
}
100

101
// AddCheck adds a new healthcheck to our monitor.
102
func (m *Monitor) AddCheck(check *Observation) error {
1✔
103

1✔
104
        m.wg.Add(1)
1✔
105
        go func(check *Observation) {
2✔
106
                defer m.wg.Done()
1✔
107

1✔
108
                check.monitor(m.cfg.Shutdown, m.quit)
1✔
109
        }(check)
1✔
110

111
        return nil
1✔
112
}
113

114
// CreateCheck is a helper function that takes a function that produces an error
// and wraps it in a function that returns its result on an error channel.
// We do not wait group the goroutine running our checkFunc because we expect
// to be dealing with health checks that may block; if we wait group them, we
// may wait forever. Ideally future health checks will allow callers to cancel
// them early, and we can wait group this.
func CreateCheck(checkFunc func() error) func() chan error {
	return func() chan error {
		// The channel is buffered so that the goroutine can deliver
		// its result and exit even if nobody receives it.
		errChan := make(chan error, 1)
		go func() {
			errChan <- checkFunc()
		}()

		return errChan
	}
}
130

131
// Observation represents a liveness check that we periodically check.
132
type Observation struct {
133
        // Name describes the health check.
134
        Name string
135

136
        // Check runs the health check itself, returning an error channel that
137
        // is expected to receive nil or an error.
138
        Check func() chan error
139

140
        // Interval is a ticker which triggers running our check function. This
141
        // ticker must be started and stopped by the observation.
142
        Interval ticker.Ticker
143

144
        // Attempts is the number of calls we make for a single check before
145
        // failing.
146
        Attempts int
147

148
        // Timeout is the amount of time we allow our check function to take
149
        // before we time it out.
150
        Timeout time.Duration
151

152
        // Backoff is the amount of time we back off between retries for failed
153
        // checks.
154
        Backoff time.Duration
155

156
        // OnSuccess is a callback which will be executed when the healthcheck
157
        // succeeds. This is optional.
158
        OnSuccess func()
159

160
        // OnFailure is a callback which will be executed when the healthcheck
161
        // fails. This is optional.
162
        OnFailure func()
163
}
164

165
// ObservationOption describes the signature of a functional option that can be
166
// used to modify the behaviour of an Observation.
167
type ObservationOption func(*Observation)
168

169
// WithSuccessCallback configures an observation with a callback to be fired
170
// whenever the health check succeeds.
171
func WithSuccessCallback(callback func()) ObservationOption {
×
172
        return func(o *Observation) {
×
173
                o.OnSuccess = callback
×
174
        }
×
175
}
176

177
// WithFailureCallback configures an observation with a callback to be fired
178
// whenever the health check reaches its failure threshold.
179
func WithFailureCallback(callback func()) ObservationOption {
×
180
        return func(o *Observation) {
×
181
                o.OnFailure = callback
×
182
        }
×
183
}
184

185
// NewObservation creates an observation.
186
func NewObservation(name string, check func() error, interval, timeout,
187
        backoff time.Duration, attempts int,
188
        opts ...ObservationOption) *Observation {
×
189

×
190
        observation := &Observation{
×
191
                Name:     name,
×
192
                Check:    CreateCheck(check),
×
193
                Interval: ticker.New(interval),
×
194
                Attempts: attempts,
×
195
                Timeout:  timeout,
×
196
                Backoff:  backoff,
×
197
        }
×
198

×
199
        // Apply each option to the observation.
×
200
        for _, opt := range opts {
×
201
                opt(observation)
×
202
        }
×
203

204
        // Ensure that we default to NO-OP callbacks.
205
        if observation.OnSuccess == nil {
×
206
                observation.OnSuccess = noOpCallback
×
207
        }
×
208

209
        if observation.OnFailure == nil {
×
210
                observation.OnFailure = noOpCallback
×
211
        }
×
212

213
        return observation
×
214
}
215

216
// String returns a string representation of an observation.
217
func (o *Observation) String() string {
26✔
218
        return o.Name
26✔
219
}
26✔
220

221
// monitor executes a health check every time its interval ticks until the quit
222
// channel signals that we should shutdown. This function is also responsible
223
// for starting and stopping our ticker.
224
func (o *Observation) monitor(shutdown shutdownFunc, quit chan struct{}) {
3✔
225
        log.Debugf("Monitoring: %v", o)
3✔
226

3✔
227
        o.Interval.Resume()
3✔
228
        defer o.Interval.Stop()
3✔
229

3✔
230
        for {
12✔
231
                select {
9✔
232
                case <-o.Interval.Ticks():
9✔
233
                        // retryCheck will return errMaxAttemptsReached when
9✔
234
                        // the max attempts are reached. In that case we will
9✔
235
                        // stop the ticker and quit.
9✔
236
                        if o.retryCheck(quit, shutdown) {
12✔
237
                                o.Debugf("max attempts failed, monitor exiting")
3✔
238
                                return
3✔
239
                        }
3✔
240

241
                // Exit if we receive the instruction to shutdown.
242
                case <-quit:
×
NEW
243
                        o.Debugf("monitor quit")
×
244
                        return
×
245
                }
246
        }
247
}
248

249
// retryCheck calls a check function until it succeeds, or we reach our
250
// configured number of attempts, waiting for our back off period between failed
251
// calls. If we fail to obtain a passing health check after the allowed number
252
// of calls, we will request shutdown. It returns a bool to indicate whether
253
// the max number of attempts is reached.
254
func (o *Observation) retryCheck(quit chan struct{},
255
        shutdown shutdownFunc) bool {
15✔
256

15✔
257
        var count int
15✔
258

15✔
259
        for count < o.Attempts {
37✔
260
                // Increment our call count and call the health check endpoint.
22✔
261
                count++
22✔
262

22✔
263
                // Wait for our check to return, timeout to elapse, or quit
22✔
264
                // signal to be received.
22✔
265
                var err error
22✔
266
                select {
22✔
267
                case err = <-o.Check():
21✔
268
                        // If our error is nil, we have passed our health check,
21✔
269
                        // so we'll invoke our success callback if defined and
21✔
270
                        // then exit.
21✔
271
                        if err == nil {
29✔
272
                                o.Debugf("invoking success callback")
8✔
273

8✔
274
                                // Invoke the success callback.
8✔
275
                                o.OnSuccess()
8✔
276

8✔
277
                                return false
8✔
278
                        }
8✔
279

280
                case <-time.After(o.Timeout):
1✔
281
                        err = fmt.Errorf("health check: %v timed out after: "+
1✔
282
                                "%v", o, o.Timeout)
1✔
283

284
                case <-quit:
×
NEW
285
                        o.Debugf("monitor quit")
×
286
                        return false
×
287
                }
288

289
                // If we have reached our allowed number of attempts, this
290
                // check has failed so we'll fire the on failure callback
291
                // and request shutdown.
292
                if count == o.Attempts {
20✔
293
                        o.Debugf("invoking failure callback")
6✔
294

6✔
295
                        o.OnFailure()
6✔
296

6✔
297
                        shutdown("Health check: %v failed after %v calls", o,
6✔
298
                                o.Attempts)
6✔
299

6✔
300
                        return true
6✔
301
                }
6✔
302

303
                o.Infof("failed with: %v, attempts: %v backing off for: %v",
8✔
304
                        err, count, o.Backoff)
8✔
305

8✔
306
                // If we are still within the number of calls allowed for this
8✔
307
                // check, we wait for our back off period to elapse, or exit if
8✔
308
                // we get the signal to shutdown.
8✔
309
                select {
8✔
310
                case <-time.After(o.Backoff):
8✔
311

312
                case <-quit:
×
NEW
313
                        o.Debugf("monitor quit")
×
314
                        return false
×
315
                }
316
        }
317

318
        return false
1✔
319
}
320

321
// Infof logs an info message for an observation prefixed with the health check
322
// name.
323
func (o *Observation) Infof(format string, params ...interface{}) {
8✔
324
        log.Debugf(fmt.Sprintf("Health check: %v ", o)+format, params...)
8✔
325
}
8✔
326

327
// Debugf logs a debug message for an observation prefixed with the health check
328
// name.
329
func (o *Observation) Debugf(format string, params ...interface{}) {
17✔
330
        log.Debugf(fmt.Sprintf("Health check: %v ", o)+format, params...)
17✔
331
}
17✔
STATUS · Troubleshooting · Open an Issue · Sales · Support · CAREERS · ENTERPRISE · START FREE · SCHEDULE DEMO
ANNOUNCEMENTS · TWITTER · TOS & SLA · Supported CI Services · What's a CI service? · Automated Testing

© 2025 Coveralls, Inc