Multi-server pubsub healthcheck endpoint #1730
app/server/lib/HealthChecker.ts (new file)
@@ -0,0 +1,140 @@
import {GristServer} from 'app/server/lib/GristServer';
import log from 'app/server/lib/log';
import {createPubSubManager, IPubSubManager} from 'app/server/lib/PubSubManager';
import * as shutdown from 'app/server/lib/shutdown';

import {v4 as uuidv4} from 'uuid';

// Not to be confused with health checks from the frontend, these
// request/response pairs are internal checks between Grist instances
// in multi-server environments.
interface ServerHealthcheckRequest {
  id: string;
  instanceId: string;
  checkReady: boolean;
}
interface ServerHealthcheckResponse {
  instanceId: string;
  requestId: string;
  healthy: boolean;
}

// For keeping track of pending health checks for all other servers
// for each request that was broadcast to all of them.
interface PendingServerHealthCheck {
  expectedCount: number;
  responses: Record<string, boolean>;
  resolve: (res: boolean) => void;
  reject: (err: Error) => void;
  timeout: NodeJS.Timeout;
}

/** This class uses pubsub via Redis, if available, to register this
 * Grist instance and check that all other instances are healthy.
 *
 * In single-server instances, it also works without Redis, leveraging
 * the dummy defaults of `PubSubManager`.
 */
export class HealthChecker {
  private _pendingServerHealthChecks: Map<string, PendingServerHealthCheck>;
  private _serverInstanceID: string;
  private _pubSubManager: IPubSubManager;

  constructor(
    private _server: GristServer
  ) {
    this._pubSubManager = createPubSubManager(process.env.REDIS_URL);
    this._pendingServerHealthChecks = new Map<string, PendingServerHealthCheck>();
    this._serverInstanceID = process.env.GRIST_INSTANCE_ID || `testInstanceId_${this._server.getHost()}`;
    this._pubSubManager.getClient()?.sadd('grist-instances', this._serverInstanceID).catch((err) => {
      log.error('Failed to contact redis', err);
    });
    this._subscribeToChannels();

    // Make sure we clean up our Redis mess, if any, even if we exit
    // by signal.
    shutdown.addCleanupHandler(null, () => this.close());
  }

  /** Returns a promise that resolves to `true` if all other
   * registered instances respond as healthy within the given
   * timeout.
   *
   * @param {number} timeout - number of milliseconds to wait for
   * responses from all servers before timing out
   *
   * @param {boolean} checkReady - whether to insist on `ready` status
   * or just a simple health check
   */
  public async allServersOkay(timeout: number, checkReady: boolean): Promise<boolean> {
    const requestId = uuidv4();
    const client = this._pubSubManager.getClient();

    // If there is no Redis, then our current instance is the only instance.
    const allInstances = await client?.smembers('grist-instances') || [this._serverInstanceID];

    const allInstancesPromise: Promise<boolean> = new Promise((resolve: (res: boolean) => void, reject) => {
      const allInstancesTimeout = setTimeout(() => {
        log.warn('allServersOkay: timeout waiting for responses');
        reject(new Error('Timeout waiting for health responses'));
        this._pendingServerHealthChecks.delete(requestId);
      }, timeout);

      this._pendingServerHealthChecks.set(requestId, {
        responses: {},
        expectedCount: allInstances.length,
        resolve,
        reject,
        timeout: allInstancesTimeout,
      });
    }).catch(() => false);
    const request: ServerHealthcheckRequest = {
      id: requestId,
      instanceId: this._serverInstanceID,
      checkReady,
    };
    await this._pubSubManager.publish('healthcheck:requests', JSON.stringify(request));
    return allInstancesPromise;
  }

  public async close() {
    await this._pubSubManager.getClient()?.srem('grist-instances', [this._serverInstanceID]);
    await this._pubSubManager.close();
  }

  private _subscribeToChannels() {
    this._pubSubManager.subscribe('healthcheck:requests', async (message) => {
      const request: ServerHealthcheckRequest = JSON.parse(message);
      const response: ServerHealthcheckResponse = {
        instanceId: this._serverInstanceID || '',
        requestId: request.id,
        healthy: !request.checkReady || this._server.ready,
      };
      log.debug('allServersOkay request', response);
      await this._pubSubManager.publish(`healthcheck:responses-${request.instanceId}`, JSON.stringify(response));
    });

    this._pubSubManager.subscribe(`healthcheck:responses-${this._serverInstanceID}`, (message) => {
      const response: ServerHealthcheckResponse = JSON.parse(message);
      const pending = this._pendingServerHealthChecks.get(response.requestId);
      if (!pending) {
        // This instance didn't broadcast a health check request with
        // this requestId, so nothing to do.
        return;
      }

      pending.responses[response.instanceId] = response.healthy;
      log.debug(
        `allServersOkay cleared pending response on ${this._serverInstanceID} for ${response.instanceId}`
      );

      if (Object.keys(pending.responses).length === pending.expectedCount) {
        // All servers have replied. Make it known and clean up.
        clearTimeout(pending.timeout);
        pending.resolve(Object.values(pending.responses).every(e => e));
        this._pendingServerHealthChecks.delete(response.requestId);
      }
    });
  }
}
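For orientation only, here is a minimal sketch of how an HTTP endpoint could consume this class. The route path and the `app`, `gristServer`, and `healthChecker` variables are illustrative assumptions, not code from this PR; only `HealthChecker` and `allServersOkay()` come from the file above.

```typescript
// Hypothetical wiring, assuming an Express `app` and a constructed GristServer
// (`gristServer`) are already available in the surrounding server setup code.
const healthChecker = new HealthChecker(gristServer);

app.get('/api/admin/all-servers-ready', async (req, res) => {
  // Wait up to 5 seconds for every registered instance to report `ready`.
  const ok = await healthChecker.allServersOkay(5000, true);
  res.status(ok ? 200 : 503).json({allServersOkay: ok});
});
```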
Could you add a doc-comment to this file that describes how these work, and why it's done this way? In particular, I assume it's a deliberate choice to rely on redis rather than, say, call each server's /status endpoint -- is that because we don't want to assume that servers can communicate with each other directly? That also means that this check doesn't reflect the ability for the servers to communicate with each other, right?
Sure, I'll write something.
How would servers know what other servers exist without Redis?
And since we're already using Redis to communicate between doc workers, it seemed like a natural extension to communicate with all Grist instances.
Does that make sense?
Well... this isn't entirely convincing. But I might not fully understand the goal of this check. Is it about checking whether a multi-server setup works correctly, or is it more narrow than that, like how many home-server or doc-workers are running?
Good question, even without the "Redis" part. Is this check intended to verify that servers know about each other for the purpose of their functionality, or only for the purpose of the health check?
We use Redis to coordinate assignments of docs to workers, and for some other things, like sessions and notifications, but for most traffic between home servers and doc workers, I think we rely on them being able to make HTTP requests to one another.
I am asking these questions to understand the purpose and make sure we are on the same page; I'm not saying this should be part of this healthcheck -- I can imagine that when setting up, there is value in knowing that, say, 3 doc-workers are running and healthy, even if the networking part of communicating with them isn't yet working. Is that the goal?
One more question: if I restart some servers, do we want this check to tell me how many got successfully restarted? Just knowing how many are healthy doesn't tell me if I have a mix of old and new ones.
The ultimate goal here is to be able to restart all the instances. That will be the next step. The restart is a button on the admin panel, but currently it only restarts whichever server gets hit with the restart request.
After restarting all the instances, how does the web browser know when it's time to reload the page and get the newly restarted Grist page?
I decided it should be when all of the servers have been restarted and are ready to serve requests. That's how I arrived at this design. If we reloaded the page without all servers being restarted, we might be in some ambiguous state where some servers aren't ready yet.
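As an illustration of that flow, a minimal sketch (not part of this diff) of how the restarting side might wait before telling the browser to reload, using `allServersOkay()` from `HealthChecker` above; the function name, timeout values, and polling interval are assumptions.

```typescript
// Poll until every registered instance reports ready, or give up.
// `HealthChecker` is the class from this PR; everything else is illustrative.
async function waitForAllServersReady(checker: HealthChecker, deadlineMs = 60_000): Promise<boolean> {
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    // checkReady=true insists on full `ready` status, not just basic health.
    if (await checker.allServersOkay(5_000, true)) {
      return true;   // all instances responded ready: safe to reload the page
    }
    await new Promise(resolve => setTimeout(resolve, 1_000));
  }
  return false;
}
```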
Not explicitly. This is using the `ready` status of an instance, which is a little different from `healthy`. The `healthy` state just means that the `status` endpoint is ready, which happens very early during Grist initialisation. This new endpoint instead checks that all instances that are registered on Redis have hit the `ready` status, which comes at the end of the initialisation of an instance, one of the last things that `MergedServer` does. The `ready` status should mean that networking is also ready. Of course, if you have some networking problem between your servers where they can't even reach each other or Redis, this check won't catch that.

As to the choice of using Redis instead of internal HTTP requests between the Grist instances, it just seemed easier to me to use a shared Redis channel, since Redis seemed inevitable for discovery to begin with.
If restart is the main goal, then I think my last question from the previous comment might be relevant?
Right, so it seems to me like the problem of an instance spontaneously dying already requires infrastructure not directly shipped with Grist Core. Maybe we can offer advice on how to handle this problem or supply this infrastructure outside of this PR.
What I can add here is an extra endpoint to instruct a server to clean up the registry of instances on Redis.
I think you are right, and this comment comes closest to explaining why I feel this whole approach isn't the right one for checking health of a multi-server installation. The issue is that not one part of this knows anything about which servers are expected to be a part of the installation. That knowledge is presumably in some external infrastructure, but then only with the help of that external infrastructure can you know the health of the installation.
E.g. let's say you start a cluster with 3 home servers and 10 doc workers, and you forgot to configure Redis for the doc-workers. Then home servers register with Redis, doc-workers don't, and the healthcheck done this way will report that everything is healthy, when it's not even close. Or let's say everything is up and connected to Redis, and you updated some config and told everything to restart, but half your servers didn't react to your restart request, and are still running with the old config (e.g. wrong activation key, or wrong version, etc). Still, this healthcheck will report healthy. If you messed up and got two home servers running that point to different Redis URLs, then whichever one the LB happens to ask will independently report that it is healthy, though the setup is broken.
All this approach does is check whether servers that successfully connected to Redis and registered themselves are still connected. Even for the narrow situation when everything is up and clean and registered in Redis, and there is nothing stale, does this help with restarts? As soon as you tell all servers to restart, they'll unregister themselves before exiting, and the healthcheck will return "all clear" even if they don't start up again, since it only checks servers that have successfully started and registered themselves.
I'm just making up scenarios, but fundamentally, it seems like anyone running an actual multi-server installation must have some external infrastructure to know what the servers are intended to be running. If we try to build tools to manage a cluster (even e.g. turn on "enterprise") without at all being aware of this infrastructure, or knowing which servers the admin intends to be affected, I think we might make the job of the administrator harder rather than easier.
Tracing the problem back, "turning on enterprise" converted to a "reboot all instances" problem because of when configuration is loaded in the code. One could imagine changing that. But for an admin panel, if we expect in the future to offer ways to change configuration other than the enterprise toggle, the ability to reload the node process everywhere does feel like a good investment, so we don't have to be policing every PR that reads configuration. And being able to display some state about the instances also seems obviously useful.
But it is true that knowing the set of live instances in order to coordinate with them is awkward. Registering/deregistering is brittle. Heartbeats on a channel that e.g. home servers listen to and maintain some state for are an alternative, but would also have some failure modes.
For a configuration change, you could imagine the following process (see the sketch after this list):
- Each server publishes an `{id: ..., version: ...}` heartbeat from time to time on a `heartbeat` channel.
- Listeners (e.g. home servers) keep track of what they hear in a `heartbeats` object.
- To apply a change, publish an `{action: "restart", version: ...}` message on a `chatter` channel everyone listens to. Version code is invented, to represent the config change.

Rather than thinking about health, you can think about whether all servers issuing heartbeats are known to be using the latest config.
This needs fleshing out and has problems; it's just a different direction I was thinking about. But a reasonable alternative could also be to just back off from supporting an enterprise toggle in multi-server installations (but how do you know they are multi-server? use the existence of Redis as a hint?) and just give instructions.
Yes, it makes a lot more sense if we step away from the "is the cluster healthy?" question, and instead ask something like: "are all servers that are connected to Redis on the expected version of config?"
Also, by exposing more information than a boolean (e.g. "list of all servers connected to Redis"), we'd make it possible for an admin to judge health by comparing that list to what they know about how many servers they expect to be up.
Am I right that this is the first foray into adding tools within Grist to make it easier for self-hosters to run multi-server installations? It would be good if it was a start that made further improvements easier. We know there are a number of difficult things about multi-server, including getting servers to be able to talk to each other (configuring `APP_DOC_INTERNAL_URL` and the like) and unregistering crashed doc-workers. What seems useful is to have all servers (doc-workers and home servers) report their info. Maybe instead of broadcasting a heartbeat on Redis pubsub, they can periodically update their info in Redis, including start time, version of Grist, version of config (whatever that means), internal URL, status bits for things a server knows how to check for in its usual `/status` endpoint (like DB connectivity), and the timestamp of this update. This would be enough information to answer a lot of questions, particularly in combination with other info (like knowledge of expected servers, results of pings to internal URLs, contents of the `workers-available` key holding doc-worker registrations, etc).

For the restart task, a home server requesting a restart would only have to poll this info for up to a timeout, until it sees that everyone is reporting the expected version of config.
Setting data in Redis has the advantage over broadcasting that a server (or even a separate tool) can do a check even if it hasn't been subscribed to pubsub since the right point in time. E.g. if a home server requesting a restart is itself restarting, or if a separate page load of the Admin Panel is asking a different home server whether the config change has taken effect.
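A hedged sketch of that variant, assuming `getClient()` returns an ioredis-style client when `REDIS_URL` is set (as the `sadd`/`smembers` calls in this diff suggest); the key names, fields, and timings are made up for illustration.

```typescript
import {createPubSubManager} from 'app/server/lib/PubSubManager';

const pubSub = createPubSubManager(process.env.REDIS_URL);
const instanceId = process.env.GRIST_INSTANCE_ID || 'instance-1';   // assumed fallback

// Each server periodically records its own info under a well-known key.
setInterval(async () => {
  await pubSub.getClient()?.hset(`grist-instance-info:${instanceId}`, {
    startedAt: String(Date.now() - process.uptime() * 1000),
    gristVersion: '1.x.y',            // placeholder
    configVersion: 'config-v2',       // placeholder for "version of config"
    updatedAt: String(Date.now()),
  });
}, 10_000);

// A server coordinating a restart polls until every registered instance
// reports the expected config version, or gives up after a deadline.
async function waitForConfigVersion(expected: string, deadlineMs: number): Promise<boolean> {
  const client = pubSub.getClient();
  if (!client) { return true; }    // single-server: nothing to wait for
  const start = Date.now();
  while (Date.now() - start < deadlineMs) {
    const ids = await client.smembers('grist-instances');
    const versions = await Promise.all(
      ids.map(id => client.hget(`grist-instance-info:${id}`, 'configVersion')));
    if (versions.length > 0 && versions.every(v => v === expected)) { return true; }
    await new Promise(resolve => setTimeout(resolve, 1000));
  }
  return false;
}
```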
That makes sense!
Exactly, that was my hope. Until now, multi-server installations have been ignored.