mirror of
https://github.com/openfrontio/OpenFrontIO.git
synced 2026-06-21 12:20:46 +00:00
feat(server): add health api endpoint for increased observability (#3264)
## Description: Adds an additional API endpoint to the server for health, using the master lobby service as the health metric. The master lobby service is considered healthy if the lobby service has started (i.e. it had enough ready workers to start), and the current number of ready workers is at least half of the desired number (with a minimum of one). This means that we won't show as healthy until all the workers start, and then we will continue to show as healthy even if a few workers crash, as long as at least half are still running. Any fewer than that, and the service becomes unhealthy. The endpoint is also set to "no cache" in the nginx config. This is to ensure that any checks of the server health show the true value, and cannot show false/stale data served by nginx, Cloudflare, or anything else. ## Please complete the following: - [x] I have added screenshots for all UI updates - [x] I process any text displayed to the user through translateText() and I've added it to the en.json file - [x] I have added relevant tests to the test directory - [x] I confirm I have thoroughly tested these changes and take full responsibility for any bugs introduced ## Please put your Discord username so you can be contacted if a bug or regression is found: jish
This commit is contained in:
+20
-1
@@ -120,6 +120,25 @@ server {
|
||||
proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
# /api/health endpoint - No caching, always hit the backend
|
||||
location = /api/health {
    proxy_pass http://127.0.0.1:3000;
    proxy_http_version 1.1;

    # Cache configuration - No caching for health checks.
    # "always" is required on every add_header: without it nginx only adds
    # the header to a fixed set of 2xx/3xx responses, so the 503 the backend
    # returns when it is unhealthy would go out without any no-cache headers.
    proxy_cache off;
    add_header X-Cache-Status "BYPASS" always;
    add_header Cache-Control "no-store, no-cache, must-revalidate, proxy-revalidate" always;
    add_header Pragma "no-cache" always;
    add_header Expires "0" always;

    # Standard proxy headers
    proxy_set_header Host $host;
    proxy_set_header X-Real-IP $remote_addr;
    proxy_set_header X-Forwarded-For $proxy_add_x_forwarded_for;
    proxy_set_header X-Forwarded-Proto $scheme;
}
|
||||
|
||||
# /commit.txt endpoint - Cache for 5 seconds
|
||||
location = /commit.txt {
|
||||
@@ -250,4 +269,4 @@ server {
|
||||
proxy_set_header X-Forwarded-Proto $scheme;
|
||||
}
|
||||
|
||||
}
|
||||
}
|
||||
|
||||
@@ -145,6 +145,15 @@ app.get("/api/env", async (req, res) => {
|
||||
res.json(envConfig);
|
||||
});
|
||||
|
||||
// Health probe: answers 200 {"status":"ok"} while the lobby service reports
// healthy, and 503 {"status":"unavailable"} otherwise.
app.get("/api/health", (_req, res) => {
  // A lobby service that does not exist (yet) counts as not healthy.
  const lobbyIsHealthy = lobbyService?.isHealthy() ?? false;

  if (!lobbyIsHealthy) {
    res.status(503).json({ status: "unavailable" });
    return;
  }

  res.json({ status: "ok" });
});
|
||||
|
||||
// SPA fallback route
|
||||
app.get("*", async function (_req, res) {
|
||||
try {
|
||||
|
||||
@@ -59,6 +59,13 @@ export class MasterLobbyService {
|
||||
this.readyWorkers.delete(workerId);
|
||||
}
|
||||
|
||||
/**
 * Reports whether the lobby service should be treated as healthy.
 *
 * Healthy means the service has started AND the number of ready workers is
 * at least half of the configured worker count (never less than one), so a
 * few crashed workers do not flip the service to unhealthy.
 *
 * @returns true when both conditions hold, false otherwise.
 */
isHealthy(): boolean {
  const halfOfConfigured = this.config.numWorkers() / 2;
  // Never let the requirement drop below one ready worker.
  const requiredReady = Math.max(halfOfConfigured, 1);
  const enoughReady = this.readyWorkers.size >= requiredReady;
  return this.started && enoughReady;
}
|
||||
|
||||
private handleWorkerReady(workerId: number) {
|
||||
this.readyWorkers.add(workerId);
|
||||
this.log.info(
|
||||
|
||||
@@ -0,0 +1,119 @@
|
||||
import EventEmitter from "events";
|
||||
import { describe, expect, it, vi } from "vitest";
|
||||
import { MasterLobbyService } from "../../src/server/MasterLobbyService";
|
||||
import { TestServerConfig } from "../util/TestServerConfig";
|
||||
|
||||
vi.mock("../../src/server/Logger", () => ({
|
||||
logger: {
|
||||
child: () => ({
|
||||
error: vi.fn(),
|
||||
info: vi.fn(),
|
||||
}),
|
||||
},
|
||||
}));
|
||||
|
||||
vi.mock("../../src/server/PollingLoop", () => ({
|
||||
startPolling: vi.fn(),
|
||||
}));
|
||||
|
||||
/**
 * Builds a stand-in worker for tests: a plain EventEmitter with a mocked
 * `send` method attached.
 */
function createMockWorker(): EventEmitter {
  const worker = new EventEmitter();
  Object.assign(worker, { send: vi.fn() });
  return worker;
}
|
||||
|
||||
/**
 * Emits a "message" event on the given worker carrying a `workerReady`
 * payload for `workerId`.
 */
function sendWorkerReady(worker: EventEmitter, workerId: number) {
  const readyMessage = { type: "workerReady", workerId: workerId };
  worker.emit("message", readyMessage);
}
|
||||
|
||||
/**
 * Creates a MasterLobbyService whose config reports `numWorkers` workers.
 *
 * The second constructor argument is an empty stub and the logger is a pair
 * of mocked `info` / `error` functions.
 */
function createService(numWorkers: number): MasterLobbyService {
  const serverConfig = new TestServerConfig();
  vi.spyOn(serverConfig, "numWorkers").mockReturnValue(numWorkers);

  const silentLog = { info: vi.fn(), error: vi.fn() } as any;
  const service = new MasterLobbyService(serverConfig, {} as any, silentLog);
  return service;
}
|
||||
|
||||
/**
 * Registers `count` mock workers (ids 1..count) on `service`, then emits a
 * `workerReady` message for each of them in id order.
 *
 * All workers are registered before the first ready message is sent.
 *
 * @returns the registered workers paired with their ids.
 */
function startAllWorkers(
  service: MasterLobbyService,
  count: number,
): { id: number; w: EventEmitter }[] {
  const registered: { id: number; w: EventEmitter }[] = [];

  for (let id = 1; id <= count; id++) {
    const w = createMockWorker();
    service.registerWorker(id, w as any);
    registered.push({ id, w });
  }

  registered.forEach((entry) => sendWorkerReady(entry.w, entry.id));
  return registered;
}
|
||||
|
||||
// Covers MasterLobbyService.isHealthy(): unhealthy until the service has
// started, then healthy while at least half of the configured workers
// (minimum one) are still ready.
describe("MasterLobbyService.isHealthy", () => {
  it("unhealthy before any workers register", () => {
    const service = createService(4);
    expect(service.isHealthy()).toBe(false);
  });

  it("unhealthy when workers registered but not ready", () => {
    const service = createService(2);
    service.registerWorker(1, createMockWorker() as any);
    expect(service.isHealthy()).toBe(false);
  });

  it("unhealthy when only some workers are ready (server not started)", () => {
    const service = createService(4);

    // 1 of 4 ready -- not enough to flip `started`
    const w1 = createMockWorker();
    service.registerWorker(1, w1 as any);
    sendWorkerReady(w1, 1);

    expect(service.isHealthy()).toBe(false);
  });

  it("healthy once all workers are ready", () => {
    const service = createService(2);
    startAllWorkers(service, 2);
    expect(service.isHealthy()).toBe(true);
  });

  it("stays healthy after a single worker crash", () => {
    const service = createService(4);
    startAllWorkers(service, 4);

    service.removeWorker(4); // 3 of 4 left, threshold is 2
    expect(service.isHealthy()).toBe(true);
  });

  // Boundary case: the threshold is inclusive, so exactly half still counts
  // as healthy.
  it("stays healthy at exactly half of the workers (2 of 4)", () => {
    const service = createService(4);
    startAllWorkers(service, 4);

    service.removeWorker(3);
    service.removeWorker(4); // 2 of 4 left, threshold is 2
    expect(service.isHealthy()).toBe(true);
  });

  it("goes unhealthy when too many workers crash", () => {
    const service = createService(4);
    startAllWorkers(service, 4);

    service.removeWorker(2);
    service.removeWorker(3);
    service.removeWorker(4); // 1 of 4 left, threshold is 2
    expect(service.isHealthy()).toBe(false);
  });

  it("single-worker setup goes unhealthy on crash", () => {
    const service = createService(1);
    startAllWorkers(service, 1);
    expect(service.isHealthy()).toBe(true);

    service.removeWorker(1);
    expect(service.isHealthy()).toBe(false);
  });

  it("odd worker count: threshold rounds up (3 workers)", () => {
    const service = createService(3);
    startAllWorkers(service, 3);

    // min = 3/2 = 1.5, so 2 ready is enough, 1 is not
    service.removeWorker(3);
    expect(service.isHealthy()).toBe(true);

    service.removeWorker(2);
    expect(service.isHealthy()).toBe(false);
  });
});
|
||||
Reference in New Issue
Block a user