@@ -101,6 +101,7 @@ import {AddressInfo} from 'net';
101
101
import fetch from 'node-fetch' ;
102
102
import * as path from 'path' ;
103
103
import * as serveStatic from 'serve-static' ;
104
+ import { v4 as uuidv4 } from 'uuid' ;
104
105
105
106
// Health checks are a little noisy in the logs, so we don't show them all.
106
107
// We show the first N health checks:
@@ -127,6 +128,29 @@ export interface FlexServerOptions {
127
128
settings ?: IGristCoreConfig ;
128
129
}
129
130
131
+ // Not to be confused with health checks from the frontend, these
132
+ // request/response pairs are internal checks between Grist instances
133
+ // in multi-server environments
134
/**
 * Message published on the 'healthcheck:requests' channel to ask the
 * subscribed Grist instances to report their health.
 */
interface ServerHealthcheckRequest {
  /**
   * When true, the responder reports its readiness flag as its health;
   * when false, the responder always reports itself as healthy.
   */
  checkReady: boolean;
  /** Identifier of this request; responders echo it back as `requestId`. */
  id: string;
}
138
/**
 * Message published on the 'healthcheck:responses' channel by an instance
 * answering a ServerHealthcheckRequest.
 */
interface ServerHealthcheckResponse {
  /** The `id` of the request being answered. */
  requestId: string;
  /** Identifier of the instance that produced this answer. */
  instanceId: string;
  /** Health verdict reported by that instance. */
  healthy: boolean;
}
143
+
144
+ // Tracks the responses still awaited from every registered instance
145
+ // for a single health check request that was broadcast to all of them.
146
/**
 * Bookkeeping for one broadcast health check whose answers are still being
 * collected.
 */
interface PendingServerHealthCheck {
  /** Health verdicts received so far, keyed by the responding instance id. */
  responses: Record<string, boolean>;
  /** Number of distinct instances whose answers are awaited. */
  expectedCount: number;
  /** Timer that fails the check if the answers do not arrive in time. */
  timeout: NodeJS.Timeout;
  /** Settles the check with the combined verdict. */
  resolve: (res: boolean) => void;
  /** Fails the check with an error. */
  reject: (err: Error) => void;
}
153
+
130
154
const noop : express . RequestHandler = ( req , res , next ) => next ( ) ;
131
155
132
156
export class FlexServer implements GristServer {
@@ -213,6 +237,8 @@ export class FlexServer implements GristServer {
213
237
private _emitNotifier = new EmitNotifier ( ) ;
214
238
private _testPendingNotifications : number = 0 ;
215
239
private _latestVersionAvailable ?: LatestVersionAvailable ;
240
+ private _instanceId : string ;
241
+ private _pendingServerHealthChecks : Map < string , PendingServerHealthCheck > ;
216
242
217
243
constructor ( public port : number , public name : string = 'flexServer' ,
218
244
public readonly options : FlexServerOptions = { } ) {
@@ -274,6 +300,9 @@ export class FlexServer implements GristServer {
274
300
this . setLatestVersionAvailable ( latestVersionAvailable ) ;
275
301
} ) ;
276
302
303
+ this . _pendingServerHealthChecks = new Map < string , PendingServerHealthCheck > ( ) ;
304
+ this . _registerGristInstance ( ) ;
305
+
277
306
// The electron build is not supported at this time, but this stub
278
307
// implementation of electronServerMethods is present to allow kicking
279
308
// its tires.
@@ -600,6 +629,34 @@ export class FlexServer implements GristServer {
600
629
if ( isParameterOn ( req . query . ready ) ) {
601
630
checks . set ( 'ready' , this . _isReady ) ;
602
631
}
632
+ if ( isParameterOn ( req . query . allInstancesReady ) ) {
633
+ const requestId = uuidv4 ( ) ;
634
+ const client = this . _pubSubManager . getClient ( ) ;
635
+
636
+ // If there is no redis, then our current instance is the only instance
637
+ const allInstances = await client ?. smembers ( 'grist-instances' ) || [ this . _instanceId ] ;
638
+
639
+ const allInstancesPromise : Promise < boolean > = new Promise ( ( resolve , reject ) => {
640
+ const allInstancesTimeout = setTimeout ( ( ) => {
641
+ reject ( new Error ( 'Timeout waiting for health responses' ) ) ;
642
+ this . _pendingServerHealthChecks . delete ( requestId ) ;
643
+ } , timeout ) ;
644
+
645
+ this . _pendingServerHealthChecks . set ( requestId , {
646
+ responses : { } ,
647
+ expectedCount : allInstances . length ,
648
+ resolve,
649
+ reject,
650
+ timeout : allInstancesTimeout ,
651
+ } ) ;
652
+ } ) ;
653
+ const request : ServerHealthcheckRequest = {
654
+ id : requestId ,
655
+ checkReady : true
656
+ } ;
657
+ await this . _pubSubManager . publish ( 'healthcheck:requests' , JSON . stringify ( request ) ) ;
658
+ checks . set ( 'allInstancesReady' , allInstancesPromise ) ;
659
+ }
603
660
let extra = '' ;
604
661
let ok = true ;
605
662
// If we had any extra check, collect their status to report them.
@@ -1061,6 +1118,7 @@ export class FlexServer implements GristServer {
1061
1118
if ( this . httpsServer ) { this . httpsServer . close ( ) ; }
1062
1119
if ( this . housekeeper ) { await this . housekeeper . stop ( ) ; }
1063
1120
if ( this . _jobs ) { await this . _jobs . stop ( ) ; }
1121
+ await this . _pubSubManager . getClient ( ) ?. srem ( 'grist-instances' , [ this . _instanceId ] ) ;
1064
1122
await this . _shutdown ( ) ;
1065
1123
if ( this . _accessTokens ) { await this . _accessTokens . close ( ) ; }
1066
1124
// Do this after _shutdown, since DocWorkerMap is used during shutdown.
@@ -2693,6 +2751,40 @@ export class FlexServer implements GristServer {
2693
2751
} ,
2694
2752
} ) ;
2695
2753
}
2754
+
2755
/**
 * Registers this server among the known Grist instances and wires up the
 * instance-to-instance health check channels.
 *
 * - Picks the instance id from GRIST_INSTANCE_ID, falling back to an id
 *   derived from the port.
 * - Adds that id to the 'grist-instances' set when a client is available.
 * - Answers every message on 'healthcheck:requests' with this instance's
 *   health on 'healthcheck:responses'.
 * - Collects messages on 'healthcheck:responses' into the matching pending
 *   check and settles it once enough instances have answered.
 *
 * Pub/sub payloads come from outside this process, so parsing and publishing
 * failures are logged rather than allowed to escape the callbacks.
 */
private _registerGristInstance() {
  this._instanceId = process.env.GRIST_INSTANCE_ID || `testInsanceId_${this.port}`;

  // Best effort: getClient() may yield nothing, in which case there is no
  // shared set to register in.
  this._pubSubManager.getClient()?.sadd('grist-instances', this._instanceId).catch((err) => {
    log.error('Failed to contact redis', err);
  });

  this._pubSubManager.subscribe('healthcheck:requests', async (message) => {
    try {
      const request: ServerHealthcheckRequest = JSON.parse(message);
      const response: ServerHealthcheckResponse = {
        instanceId: this._instanceId,
        requestId: request.id,
        // Without checkReady, merely being able to answer counts as healthy.
        healthy: !request.checkReady || this._isReady,
      };
      log.debug(`Healthcheck request`, response);
      await this._pubSubManager.publish('healthcheck:responses', JSON.stringify(response));
    } catch (err) {
      // A malformed request or a failed publish must not surface as an
      // unhandled rejection; the requester will simply time out.
      log.error('Failed to answer healthcheck request', err);
    }
  });

  this._pubSubManager.subscribe('healthcheck:responses', (message) => {
    let response: ServerHealthcheckResponse;
    try {
      response = JSON.parse(message);
    } catch (err) {
      log.error('Ignoring malformed healthcheck response', err);
      return;
    }

    // Responses for requests issued by other instances (or already settled
    // ones) have no pending entry here and are ignored, as are payloads
    // that do not identify their sender.
    const pending = this._pendingServerHealthChecks.get(response?.requestId);
    if (!pending || typeof response.instanceId !== 'string') {
      return;
    }

    pending.responses[response.instanceId] = Boolean(response.healthy);

    // Use >= so that a check still settles if more instances answer than
    // were counted when the request was sent.
    if (Object.keys(pending.responses).length >= pending.expectedCount) {
      // All servers have replied. Make it known and clean up.
      clearTimeout(pending.timeout);
      pending.resolve(Object.values(pending.responses).every(e => e));
      this._pendingServerHealthChecks.delete(response.requestId);
    }
  });
}
2696
2788
}
2697
2789
2698
2790
/**
0 commit comments