Multi-server pubsub healthcheck endpoint #1730
app/server/lib/HealthChecker.ts (new file)
@@ -0,0 +1,140 @@
import {GristServer} from 'app/server/lib/GristServer';
import log from 'app/server/lib/log';
import {createPubSubManager, IPubSubManager} from 'app/server/lib/PubSubManager';
import * as shutdown from 'app/server/lib/shutdown';

import {v4 as uuidv4} from 'uuid';

// Not to be confused with health checks from the frontend, these
// request/response pairs are internal checks between Grist instances
// in multi-server environments.
interface ServerHealthcheckRequest {
  id: string;
  instanceId: string;
  checkReady: boolean;
}
interface ServerHealthcheckResponse {
  instanceId: string;
  requestId: string;
  healthy: boolean;
}

// For keeping track of pending health checks for all other servers
// for each request that was broadcast to all of them.
interface PendingServerHealthCheck {
  expectedCount: number;
  responses: Record<string, boolean>;
  resolve: (res: boolean) => void;
  reject: (err: Error) => void;
  timeout: NodeJS.Timeout;
}

/** This class uses pubsub via Redis, if available, to register this
 * Grist instance and check that all other instances are healthy.
 *
 * In single-server instances, it also works without Redis, leveraging
 * the dummy defaults of `PubSubManager`.
 */
export class HealthChecker {
  private _pendingServerHealthChecks: Map<string, PendingServerHealthCheck>;
  private _serverInstanceID: string;
  private _pubSubManager: IPubSubManager;

  constructor(
    private _server: GristServer
  ) {
    this._pubSubManager = createPubSubManager(process.env.REDIS_URL);
    this._pendingServerHealthChecks = new Map<string, PendingServerHealthCheck>();
    this._serverInstanceID = process.env.GRIST_INSTANCE_ID || `testInstanceId_${this._server.getHost()}`;
    this._pubSubManager.getClient()?.sadd('grist-instances', this._serverInstanceID).catch((err) => {
      log.error('Failed to contact redis', err);
    });
    this._subscribeToChannels();

    // Make sure we clean up our Redis mess, if any, even if we exit
    // by signal.
    shutdown.addCleanupHandler(null, () => this.close());
  }

  /** Returns a promise that resolves to `true` if all other
   * registered instances respond as healthy within the given
   * timeout.
   *
   * @param {number} timeout - number of milliseconds to wait for
   * responses from all servers before timing out
   *
   * @param {boolean} checkReady - whether to insist on `ready` status
   * or just a simple health check
   */
  public async allServersOkay(timeout: number, checkReady: boolean): Promise<boolean> {
    const requestId = uuidv4();
    const client = this._pubSubManager.getClient();

    // If there is no Redis, then our current instance is the only instance.
    const allInstances = await client?.smembers('grist-instances') || [this._serverInstanceID];

    const allInstancesPromise: Promise<boolean> = new Promise((resolve: (res: boolean) => void, reject) => {
      const allInstancesTimeout = setTimeout(() => {
        log.warn('allServersOkay: timeout waiting for responses');
        reject(new Error('Timeout waiting for health responses'));
        this._pendingServerHealthChecks.delete(requestId);
      }, timeout);

      this._pendingServerHealthChecks.set(requestId, {
        responses: {},
        expectedCount: allInstances.length,
        resolve,
        reject,
        timeout: allInstancesTimeout,
      });
    }).catch(() => false);
    const request: ServerHealthcheckRequest = {
      id: requestId,
      instanceId: this._serverInstanceID,
      checkReady,
    };
    await this._pubSubManager.publish('healthcheck:requests', JSON.stringify(request));
    return allInstancesPromise;
  }

  public async close() {
    await this._pubSubManager.getClient()?.srem('grist-instances', [this._serverInstanceID]);
    await this._pubSubManager.close();
  }

  private _subscribeToChannels() {
    this._pubSubManager.subscribe('healthcheck:requests', async (message) => {
      const request: ServerHealthcheckRequest = JSON.parse(message);
      const response: ServerHealthcheckResponse = {
        instanceId: this._serverInstanceID || '',
        requestId: request.id,
        healthy: !request.checkReady || this._server.ready,
      };
      log.debug('allServersOkay request', response);
      await this._pubSubManager.publish(`healthcheck:responses-${request.instanceId}`, JSON.stringify(response));
    });

    this._pubSubManager.subscribe(`healthcheck:responses-${this._serverInstanceID}`, (message) => {
      const response: ServerHealthcheckResponse = JSON.parse(message);
      const pending = this._pendingServerHealthChecks.get(response.requestId);
      if (!pending) {
        // This instance didn't broadcast a health check request with
        // this requestId, so nothing to do.
        return;
      }

      pending.responses[response.instanceId] = response.healthy;
      log.debug(
        `allServersOkay cleared pending response on ${this._serverInstanceID} for ${response.instanceId}`
      );

      if (Object.keys(pending.responses).length === pending.expectedCount) {
        // All servers have replied. Make it known and clean up.
        clearTimeout(pending.timeout);
        pending.resolve(Object.values(pending.responses).every(e => e));
        this._pendingServerHealthChecks.delete(response.requestId);
      }
    });
  }
}
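For orientation only, here is a minimal sketch of how an HTTP endpoint could consume this class. The route path and the `app`, `gristServer`, and `healthChecker` variables are illustrative assumptions, not code from this PR; only `HealthChecker` and `allServersOkay()` come from the file above.

```typescript
// Hypothetical wiring, assuming an Express `app` and a constructed GristServer
// (`gristServer`) are already available in the surrounding server setup code.
const healthChecker = new HealthChecker(gristServer);

app.get('/api/admin/all-servers-ready', async (req, res) => {
  // Wait up to 5 seconds for every registered instance to report `ready`.
  const ok = await healthChecker.allServersOkay(5000, true);
  res.status(ok ? 200 : 503).json({allServersOkay: ok});
});
```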
Could you add a doc-comment to this file that describes how these work, and why it's done this way? In particular, I assume it's a deliberate choice to rely on redis rather than, say, call each server's /status endpoint -- is that because we don't want to assume that servers can communicate with each other directly? That also means that this check doesn't reflect the ability for the servers to communicate with each other, right?
Sure, I'll write something.
How would servers know what other servers exist without Redis?
And since we're already using Redis to communicate between doc workers, it seemed like a natural extension to communicate with all Grist instances.
Does that make sense?
Well... this isn't entirely convincing. But I might not fully understand the goal of this check. Is it about checking whether a multi-server setup works correctly, or is it more narrow than that, like how many home-server or doc-workers are running?
Good question, even without the "Redis" part. Is this check intended to verify that servers know about each other for the purpose of their functionality, or only for the purpose of the health check?
We use Redis to coordinate assignments of docs to workers, and for some other things, like sessions and notifications, but for most traffic between home servers and doc workers, I think we rely on them being able to make HTTP requests to one another.
I am asking these questions to understand the purpose and make sure we are on the same page; I'm not saying this should be part of this healthcheck -- I can imagine that when setting up, there is value in knowing that, say, 3 doc-workers are running and healthy, even if the networking part of communicating with them isn't yet working. Is that the goal?
One more question: if I restart some servers, do we want this check to tell me how many got successfully restarted? Just knowing how many are healthy doesn't tell me if I have a mix of old and new ones.
The ultimate goal here is to be able to restart all the instances. That will be the next step. The restart is a button on the admin panel, but currently it only restarts whichever server gets hit with the restart request.
After restarting all the instances, how does the web browser know when it's time to reload the page and get the newly restarted Grist page?
I decided it should be when all of the servers have been restarted and are ready to serve requests. That's how I arrived at this design. If we reloaded the page without all servers being restarted, we might be in some ambiguous state where some servers aren't ready yet.
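As an illustration of that flow, a minimal sketch (not part of this diff) of how the restarting side might wait before telling the browser to reload, using `allServersOkay()` from `HealthChecker` above; the function name, timeout values, and polling interval are assumptions.

```typescript
// Poll until every registered instance reports ready, or give up.
// `HealthChecker` is the class from this PR; everything else is illustrative.
async function waitForAllServersReady(checker: HealthChecker, deadlineMs = 60_000): Promise<boolean> {
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    // checkReady=true insists on full `ready` status, not just basic health.
    if (await checker.allServersOkay(5_000, true)) {
      return true;   // all instances responded ready: safe to reload the page
    }
    await new Promise(resolve => setTimeout(resolve, 1_000));
  }
  return false;
}
```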
Not explicitly. This is using the `ready` status of an instance, which is a little different from `healthy`. The `healthy` state just means that the `status` endpoint is ready, which happens very early during Grist initialisation. This new endpoint instead checks that all instances that are registered on Redis have hit the `ready` status, which comes at the end of the initialisation of an instance, one of the last things that `MergedServer` does. The `ready` status should mean that networking is also ready. Of course, if you have some networking problem between your servers where they can't even reach each other or Redis, this check won't catch that.

As to the choice of using Redis instead of internal HTTP requests between the Grist instances, it just seemed easier to me to use a shared Redis channel, since Redis seemed inevitable for discovery to begin with.
If restart is the main goal, then I think my last question from the previous comment might be relevant?
Right, so it seems to me like the problem of an instance spontaneously dying already requires infrastructure not directly shipped with Grist Core. Maybe we can offer advice on how to handle this problem or supply this infrastructure outside of this PR.
What I can add here is an extra endpoint to instruct a server to clean up the registry of instances on Redis.
I think you are right, and this comment comes closest to explaining why I feel this whole approach isn't the right one for checking health of a multi-server installation. The issue is that not one part of this knows anything about which servers are expected to be a part of the installation. That knowledge is presumably in some external infrastructure, but then only with the help of that external infrastructure can you know the health of the installation.
E.g. let's say you start a cluster with 3 home servers and 10 doc workers, and you forgot to configure Redis for the doc-workers. Then home servers register with Redis, doc-workers don't, and the healthcheck done this way will report that everything is healthy, when it's not even close. Or let's say everything is up and connected to Redis, and you updated some config and told everything to restart, but half your servers didn't react to your restart request, and are still running with the old config (e.g. wrong activation key, or wrong version, etc). Still, this healthcheck will report healthy. If you messed up and got two home servers running that point to different Redis URLs, then whichever one the LB happens to ask will independently report that it is healthy, though the setup is broken.
All this approach does is check whether servers that successfully connected to Redis and registered themselves are still connected. Even for the narrow situation when everything is up and clean and registered in Redis, and there is nothing stale, does this help with restarts? As soon as you tell all servers to restart, they'll unregister themselves before exiting, and the healthcheck will return "all clear" even if they don't start up again, since it only checks servers that have successfully started and registered themselves.
I'm just making up scenarios, but fundamentally, it seems like anyone running an actual multi-server installation must have some external infrastructure to know what the servers are intended to be running. If we try to build tools to manage a cluster (even e.g. turn on "enterprise") without at all being aware of this infrastructure, or knowing which servers the admin intends to be affected, I think we might make the job of the administrator harder rather than easier.
Tracing the problem back, "turning on enterprise" converted to a "reboot all instances" problem because of when configuration is loaded in the code. One could imagine changing that. But for an admin panel, if we expect in the future to offer ways to change configuration other than the enterprise toggle, the ability to reload the node process everywhere does feel like a good investment, so we don't have to be policing every PR that reads configuration. And being able to display some state about the instances also seems obviously useful.
But it is true that knowing the set of live instances in order to coordinate with them is awkward. Registering/deregistering is brittle. Heartbeats on a channel that e.g. home servers listen to and maintain some state for are an alternative, but would also have some failure modes.
For a configuration change, you could imagine the following process (see the sketch after this list):
- Each server publishes an `{id: ..., version: ...}` heartbeat from time to time on a `heartbeat` channel.
- Listeners (e.g. home servers) keep track of what they hear in a `heartbeats` object.
- To apply a change, publish an `{action: "restart", version: ...}` message on a `chatter` channel everyone listens to. Version code is invented, to represent the config change.

Rather than thinking about health, you can think about whether all servers issuing heartbeats are known to be using the latest config.
This needs fleshing out and has problems; it's just a different direction I was thinking about. But a reasonable alternative could also be to just back off from supporting an enterprise toggle in multi-server installations (but how do you know they are multi-server? use the existence of Redis as a hint?) and just give instructions.
Yes, it makes a lot more sense if we step away from the "is the cluster healthy?" question, and instead ask something like: "are all servers that are connected to Redis on the expected version of config?"
Also, by exposing more information than a boolean (e.g. "list of all servers connected to Redis"), we'd make it possible for an admin to judge health by comparing that list to what they know about how many servers they expect to be up.
Am I right that this is the first foray into adding tools within Grist to make it easier for self-hosters to run multi-server installations? It would be good if it was a start that made further improvements easier. We know there are a number of difficult things about multi-server, including getting servers to be able to talk to each other (configuring `APP_DOC_INTERNAL_URL` and the like) and unregistering crashed doc-workers. What seems useful is to have all servers (doc-workers and home servers) report their info. Maybe instead of broadcasting a heartbeat on Redis pubsub, they can periodically update their info in Redis, including start time, version of Grist, version of config (whatever that means), internal URL, status bits for things a server knows how to check for in its usual `/status` endpoint (like DB connectivity), and the timestamp of this update. This would be enough information to answer a lot of questions, particularly in combination with other info (like knowledge of expected servers, results of pings to internal URLs, contents of the `workers-available` key holding doc-worker registrations, etc).

For the restart task, a home server requesting a restart would only have to poll this info for up to a timeout, until it sees that everyone is reporting the expected version of config.
Setting data in Redis has the advantage over broadcasting that a server (or even a separate tool) can do a check even if it hasn't been subscribed to pubsub since the right point in time. E.g. if a home server requesting a restart is itself restarting, or if a separate page load of the Admin Panel is asking a different home server whether the config change has taken effect.
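A hedged sketch of that variant, assuming `getClient()` returns an ioredis-style client when `REDIS_URL` is set (as the `sadd`/`smembers` calls in this diff suggest); the key names, fields, and timings are made up for illustration.

```typescript
import {createPubSubManager} from 'app/server/lib/PubSubManager';

const pubSub = createPubSubManager(process.env.REDIS_URL);
const instanceId = process.env.GRIST_INSTANCE_ID || 'instance-1';   // assumed fallback

// Each server periodically records its own info under a well-known key.
setInterval(async () => {
  await pubSub.getClient()?.hset(`grist-instance-info:${instanceId}`, {
    startedAt: String(Date.now() - process.uptime() * 1000),
    gristVersion: '1.x.y',            // placeholder
    configVersion: 'config-v2',       // placeholder for "version of config"
    updatedAt: String(Date.now()),
  });
}, 10_000);

// A server coordinating a restart polls until every registered instance
// reports the expected config version, or gives up after a deadline.
async function waitForConfigVersion(expected: string, deadlineMs: number): Promise<boolean> {
  const client = pubSub.getClient();
  if (!client) { return true; }    // single-server: nothing to wait for
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    const ids = await client.smembers('grist-instances');
    const versions = await Promise.all(
      ids.map(id => client.hget(`grist-instance-info:${id}`, 'configVersion')));
    if (versions.length > 0 && versions.every(v => v === expected)) { return true; }
    await new Promise(resolve => setTimeout(resolve, 1000));
  }
  return false;
}
```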
That makes sense!
Exactly, that was my hope. Until now, multi-server installations have been ignored.