mirror of
https://github.com/openfrontio/OpenFrontIO.git
synced 2026-06-21 09:30:22 +00:00
use otel for observability (#635)
## Description:

## Please complete the following:

- [ ] I have added screenshots for all UI updates
- [ ] I confirm I have thoroughly tested these changes and take full responsibility for any bugs introduced
- [ ] I understand that submitting code with bugs that could have been caught through manual testing blocks releases and new features for all contributors

## Please put your Discord username so you can be contacted if a bug or regression is found:

<DISCORD USERNAME>

Co-authored-by: evan <openfrontio@gmail.com>
This commit is contained in:
@@ -13,12 +13,6 @@ RUN apt-get update && apt-get install -y \
|
||||
wget \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Node Exporter
|
||||
RUN mkdir -p /opt/node_exporter && \
|
||||
wget -qO- https://github.com/prometheus/node_exporter/releases/download/v1.7.0/node_exporter-1.7.0.linux-amd64.tar.gz | \
|
||||
tar xvz --strip-components=1 -C /opt/node_exporter && \
|
||||
ln -s /opt/node_exporter/node_exporter /usr/local/bin/
|
||||
|
||||
# Install cloudflared
|
||||
RUN curl -L https://github.com/cloudflare/cloudflared/releases/latest/download/cloudflared-linux-amd64.deb > cloudflared.deb \
|
||||
&& dpkg -i cloudflared.deb \
|
||||
|
||||
@@ -147,8 +147,9 @@ R2_BUCKET=$R2_BUCKET
|
||||
CF_API_TOKEN=$CF_API_TOKEN
|
||||
DOMAIN=$DOMAIN
|
||||
SUBDOMAIN=$SUBDOMAIN
|
||||
MON_USERNAME=$MON_USERNAME
|
||||
MON_PASSWORD=$MON_PASSWORD
|
||||
OTEL_USERNAME=$OTEL_USERNAME
|
||||
OTEL_PASSWORD=$OTEL_PASSWORD
|
||||
OTEL_ENDPOINT=$OTEL_ENDPOINT
|
||||
EOL
|
||||
chmod 600 $REMOTE_UPDATE_PATH/.env && \
|
||||
$REMOTE_UPDATE_SCRIPT"
|
||||
|
||||
@@ -1,103 +0,0 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Metric Collector for Prometheus Pushgateway
|
||||
# This script collects metrics from Node Exporter and application sources
|
||||
# and pushes them to a Prometheus Pushgateway with custom labels.
|
||||
|
||||
# Configuration
|
||||
NODE_EXPORTER_URL="http://localhost:9100/metrics"
|
||||
APP_METRICS_URL="http://localhost:9090/metrics"
|
||||
PUSHGATEWAY_BASE_URL="https://mon.openfront.io/pushgateway/metrics"
|
||||
AUTH=$MON_USERNAME:$MON_PASSWORD
|
||||
INTERVAL=15 # seconds
|
||||
|
||||
# Function to fetch metrics from Node Exporter
|
||||
fetch_node_exporter_metrics() {
|
||||
curl -s --connect-timeout 5 --max-time 10 "$NODE_EXPORTER_URL" ||
|
||||
echo "# Error fetching Node Exporter metrics"
|
||||
}
|
||||
|
||||
# Function to fetch metrics from your application
|
||||
fetch_app_metrics() {
|
||||
curl -s --connect-timeout 5 --max-time 10 "$APP_METRICS_URL" ||
|
||||
echo "# Error fetching application metrics"
|
||||
}
|
||||
|
||||
# Function to push metrics to Pushgateway
|
||||
push_metrics() {
|
||||
local metrics=$1
|
||||
local job_name=$2
|
||||
|
||||
echo "Pushing $job_name metrics to Pushgateway..."
|
||||
|
||||
# Create a temporary file for the metrics
|
||||
TEMP_FILE=$(mktemp)
|
||||
echo "$metrics" > "$TEMP_FILE"
|
||||
|
||||
# Push to Pushgateway with instance label
|
||||
curl -s -u "$AUTH" --data-binary @"$TEMP_FILE" \
|
||||
"$PUSHGATEWAY_BASE_URL/job/$job_name/instance/$HOSTNAME"
|
||||
|
||||
# Check if push was successful
|
||||
if [ $? -eq 0 ]; then
|
||||
echo "$job_name metrics pushed successfully"
|
||||
else
|
||||
echo "Error pushing $job_name metrics"
|
||||
fi
|
||||
|
||||
# Remove temporary file
|
||||
rm "$TEMP_FILE"
|
||||
}
|
||||
|
||||
# Function to add labels to metrics
|
||||
add_labels() {
|
||||
local metrics=$1
|
||||
|
||||
# First, handle metrics with existing labels
|
||||
metrics=$(echo "$metrics" | sed -E 's/(\{[^}]*)\}/\1,env="'$ENV'",host="'$HOST'",subdomain="'$SUBDOMAIN'"}/g')
|
||||
|
||||
# Then, handle metrics with no existing labels
|
||||
metrics=$(echo "$metrics" | sed -E 's/^([a-zA-Z0-9_:]+)[ \t]+([0-9.e+-]+)$/\1{env="'$ENV'",host="'$HOST'",subdomain="'$SUBDOMAIN'"} \2/g')
|
||||
|
||||
echo "$metrics"
|
||||
}
|
||||
|
||||
# Main function to collect and push metrics
|
||||
collect_and_push_metrics() {
|
||||
echo "Starting metrics collection cycle at $(date)"
|
||||
|
||||
# Get metrics from both sources
|
||||
NODE_METRICS=$(fetch_node_exporter_metrics)
|
||||
APP_METRICS=$(fetch_app_metrics)
|
||||
|
||||
# Clean up metrics (remove headers etc.)
|
||||
NODE_METRICS=$(echo "$NODE_METRICS" | grep -v "^Fetching")
|
||||
APP_METRICS=$(echo "$APP_METRICS" | grep -v "^Fetching")
|
||||
|
||||
# Add labels to metrics
|
||||
NODE_METRICS=$(add_labels "$NODE_METRICS")
|
||||
APP_METRICS=$(add_labels "$APP_METRICS")
|
||||
|
||||
# Push to Pushgateway separately
|
||||
push_metrics "$NODE_METRICS" "node_exporter"
|
||||
push_metrics "$APP_METRICS" "app_metrics"
|
||||
|
||||
echo "Metrics collection cycle completed at $(date)"
|
||||
}
|
||||
|
||||
# Main execution: print the effective configuration once, give the
# application time to start, then run a collection cycle every
# $INTERVAL seconds until the process is stopped.
echo "===== Starting metrics collector ====="
echo "Environment: $ENV, HOST: $HOST, Subdomain: $SUBDOMAIN"
echo "Collecting and pushing metrics every $INTERVAL seconds"
echo "Node Exporter URL: $NODE_EXPORTER_URL"
echo "App Metrics URL: $APP_METRICS_URL"
echo "Pushgateway URL: $PUSHGATEWAY_BASE_URL"

# Wait for app to be ready.
sleep 30

# Then set up interval loop. The sleep comes first, so the first push
# happens 30 + INTERVAL seconds after start-up.
while true; do
  sleep "$INTERVAL"
  collect_and_push_metrics
done
|
||||
Generated
+1570
-13
File diff suppressed because it is too large
Load Diff
+14
-1
@@ -80,6 +80,18 @@
|
||||
"@aws-sdk/client-s3": "^3.758.0",
|
||||
"@datastructures-js/priority-queue": "^6.3.1",
|
||||
"@google-cloud/secret-manager": "^5.6.0",
|
||||
"@opentelemetry/api": "^1.9.0",
|
||||
"@opentelemetry/api-logs": "^0.200.0",
|
||||
"@opentelemetry/auto-instrumentations-node": "^0.58.0",
|
||||
"@opentelemetry/exporter-metrics-otlp-http": "^0.200.0",
|
||||
"@opentelemetry/exporter-trace-otlp-http": "^0.200.0",
|
||||
"@opentelemetry/host-metrics": "^0.36.0",
|
||||
"@opentelemetry/resources": "^2.0.0",
|
||||
"@opentelemetry/sdk-logs": "^0.200.0",
|
||||
"@opentelemetry/sdk-metrics": "^2.0.0",
|
||||
"@opentelemetry/sdk-node": "^0.200.0",
|
||||
"@opentelemetry/semantic-conventions": "^1.32.0",
|
||||
"@opentelemetry/winston-transport": "^0.11.0",
|
||||
"@types/dompurify": "^3.0.5",
|
||||
"@types/express": "^4.17.21",
|
||||
"@types/google-protobuf": "^3.15.12",
|
||||
@@ -95,7 +107,7 @@
|
||||
"d3": "^7.9.0",
|
||||
"discord.js": "^14.16.3",
|
||||
"dompurify": "^3.1.7",
|
||||
"dotenv": "^16.4.7",
|
||||
"dotenv": "^16.5.0",
|
||||
"express": "^4.21.1",
|
||||
"express-rate-limit": "^7.5.0",
|
||||
"google-auth-library": "^9.14.0",
|
||||
@@ -128,6 +140,7 @@
|
||||
"webpack-dev-server": "^5.0.4",
|
||||
"wheelnav": "^1.7.1",
|
||||
"winston": "^3.17.0",
|
||||
"winston-transport": "^4.9.0",
|
||||
"ws": "^8.18.0",
|
||||
"zod": "^3.23.8"
|
||||
},
|
||||
|
||||
@@ -71,20 +71,6 @@ else
|
||||
echo "UDP buffer sizes configured and applied"
|
||||
fi
|
||||
|
||||
# Check if node-exporter container already exists
|
||||
if docker ps -a | grep -q "node-exporter"; then
|
||||
echo "Node Exporter is already installed"
|
||||
else
|
||||
echo "🔄 Installing Node Exporter..."
|
||||
docker run -d --name node-exporter --restart=unless-stopped \
|
||||
--net="host" \
|
||||
--pid="host" \
|
||||
-v "/:/host:ro,rslave" \
|
||||
prom/node-exporter:latest \
|
||||
--path.rootfs=/host
|
||||
echo "Node Exporter installed successfully"
|
||||
fi
|
||||
|
||||
# Set proper ownership for openfront's home directory
|
||||
chown -R openfront:openfront /home/openfront
|
||||
echo "Set proper ownership for openfront's home directory"
|
||||
|
||||
@@ -44,6 +44,10 @@ export interface ServerConfig {
|
||||
r2Endpoint(): string;
|
||||
r2AccessKey(): string;
|
||||
r2SecretKey(): string;
|
||||
otelEndpoint(): string;
|
||||
otelUsername(): string;
|
||||
otelPassword(): string;
|
||||
otelEnabled(): boolean;
|
||||
}
|
||||
|
||||
export interface NukeMagnitude {
|
||||
|
||||
@@ -25,6 +25,20 @@ import { pastelTheme } from "./PastelTheme";
|
||||
import { pastelThemeDark } from "./PastelThemeDark";
|
||||
|
||||
export abstract class DefaultServerConfig implements ServerConfig {
|
||||
otelEnabled(): boolean {
|
||||
return Boolean(
|
||||
this.otelEndpoint() && this.otelUsername() && this.otelPassword(),
|
||||
);
|
||||
}
|
||||
otelEndpoint(): string {
|
||||
return process.env.OTEL_ENDPOINT;
|
||||
}
|
||||
otelUsername(): string {
|
||||
return process.env.OTEL_USERNAME;
|
||||
}
|
||||
otelPassword(): string {
|
||||
return process.env.OTEL_PASSWORD;
|
||||
}
|
||||
region(): string {
|
||||
if (this.env() == GameEnv.Dev) {
|
||||
return "dev";
|
||||
|
||||
+55
-1
@@ -1,4 +1,55 @@
|
||||
import * as logsAPI from "@opentelemetry/api-logs";
|
||||
import { OTLPLogExporter } from "@opentelemetry/exporter-logs-otlp-http";
|
||||
import {
|
||||
LoggerProvider,
|
||||
SimpleLogRecordProcessor,
|
||||
} from "@opentelemetry/sdk-logs";
|
||||
import { OpenTelemetryTransportV3 } from "@opentelemetry/winston-transport";
|
||||
import * as dotenv from "dotenv";
|
||||
import winston from "winston";
|
||||
import { getServerConfigFromServer } from "../core/configuration/ConfigLoader";
|
||||
import { getOtelResource } from "./OtelResource";
|
||||
dotenv.config();
|
||||
|
||||
const config = getServerConfigFromServer();
|
||||
|
||||
const resource = getOtelResource();
|
||||
|
||||
// Initialize the OpenTelemetry Logger Provider
|
||||
const loggerProvider = new LoggerProvider({
|
||||
resource,
|
||||
});
|
||||
|
||||
if (config.otelEnabled()) {
|
||||
console.log("OTEL enabled");
|
||||
// Configure OpenTelemetry endpoint with basic auth (if provided)
|
||||
const headers = {};
|
||||
if (config.otelUsername() && config.otelPassword()) {
|
||||
headers["Authorization"] =
|
||||
"Basic " +
|
||||
Buffer.from(`${config.otelUsername()}:${config.otelPassword()}`).toString(
|
||||
"base64",
|
||||
);
|
||||
}
|
||||
|
||||
// Add OTLP exporter for logs
|
||||
const logExporter = new OTLPLogExporter({
|
||||
url: `${config.otelEndpoint()}/v1/logs`,
|
||||
headers,
|
||||
});
|
||||
|
||||
// Add a log processor with the exporter
|
||||
loggerProvider.addLogRecordProcessor(
|
||||
new SimpleLogRecordProcessor(logExporter),
|
||||
);
|
||||
|
||||
// Set as the global logger provider
|
||||
logsAPI.logs.setGlobalLoggerProvider(loggerProvider);
|
||||
} else {
|
||||
console.log(
|
||||
"No OTLP endpoint and credentials provided, remote logging disabled",
|
||||
);
|
||||
}
|
||||
|
||||
// Custom format to add severity tag based on log level
|
||||
const addSeverityFormat = winston.format((info) => {
|
||||
@@ -20,7 +71,10 @@ const logger = winston.createLogger({
|
||||
service: "openfront",
|
||||
environment: process.env.NODE_ENV,
|
||||
},
|
||||
transports: [new winston.transports.Console()],
|
||||
transports: [
|
||||
new winston.transports.Console(),
|
||||
new OpenTelemetryTransportV3(),
|
||||
],
|
||||
});
|
||||
|
||||
// Export both the main logger and the child logger factory
|
||||
|
||||
@@ -11,7 +11,7 @@ import { generateID } from "../core/Util";
|
||||
import { gatekeeper, LimiterType } from "./Gatekeeper";
|
||||
import { logger } from "./Logger";
|
||||
import { MapPlaylist } from "./MapPlaylist";
|
||||
import { setupMetricsServer } from "./MasterMetrics";
|
||||
import { setupMasterMetrics } from "./MasterMetrics";
|
||||
|
||||
const config = getServerConfigFromServer();
|
||||
const playlist = new MapPlaylist();
|
||||
@@ -20,12 +20,13 @@ const readyWorkers = new Set();
|
||||
const app = express();
|
||||
const server = http.createServer(app);
|
||||
|
||||
// Create a separate metrics server on port 9090
|
||||
const metricsApp = express();
|
||||
const metricsServer = http.createServer(metricsApp);
|
||||
|
||||
const log = logger.child({ comp: "m" });
|
||||
|
||||
if (config.otelEnabled()) {
|
||||
console.log("setting up master metrics");
|
||||
setupMasterMetrics();
|
||||
}
|
||||
|
||||
const __filename = fileURLToPath(import.meta.url);
|
||||
const __dirname = path.dirname(__filename);
|
||||
app.use(express.json());
|
||||
@@ -146,9 +147,6 @@ export async function startMaster() {
|
||||
server.listen(PORT, () => {
|
||||
log.info(`Master HTTP server listening on port ${PORT}`);
|
||||
});
|
||||
|
||||
// Setup the metrics server
|
||||
setupMetricsServer();
|
||||
}
|
||||
|
||||
app.get(
|
||||
|
||||
+61
-181
@@ -1,189 +1,69 @@
|
||||
import express from "express";
|
||||
import http from "http";
|
||||
import promClient from "prom-client";
|
||||
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
|
||||
import { HostMetrics } from "@opentelemetry/host-metrics";
|
||||
import {
|
||||
MeterProvider,
|
||||
PeriodicExportingMetricReader,
|
||||
} from "@opentelemetry/sdk-metrics";
|
||||
import * as dotenv from "dotenv";
|
||||
import { getServerConfigFromServer } from "../core/configuration/ConfigLoader";
|
||||
import { getOtelResource } from "./OtelResource";
|
||||
|
||||
dotenv.config();
|
||||
|
||||
// Get server configuration
|
||||
const config = getServerConfigFromServer();
|
||||
|
||||
// Create a separate metrics server on port 9090
|
||||
const metricsApp = express();
|
||||
const metricsServer = http.createServer(metricsApp);
|
||||
// Create resource with master information
|
||||
const resource = getOtelResource();
|
||||
|
||||
// Initialize the Prometheus registry for the master's own metrics
|
||||
const register = new promClient.Registry();
|
||||
|
||||
// Default Prometheus metrics
|
||||
promClient.collectDefaultMetrics({ register });
|
||||
|
||||
// Prometheus metrics endpoint that gathers metrics from workers
|
||||
export function setupMetricsServer() {
|
||||
metricsApp.get("/metrics", async (req, res) => {
|
||||
// Set a timeout for the request to avoid hanging
|
||||
const timeout = setTimeout(() => {
|
||||
res.status(500).end("# Error: Request timed out after 30 seconds");
|
||||
}, 30000);
|
||||
console.log("Metrics requested");
|
||||
try {
|
||||
// Get the master's metrics
|
||||
const masterMetrics = await register.metrics();
|
||||
|
||||
// Track seen metric names to avoid duplicate metadata
|
||||
const seenMetrics = new Set();
|
||||
const processedLines = [];
|
||||
const allMetricValues = [];
|
||||
|
||||
// Process all metadata information in the master metrics first
|
||||
const masterLines = masterMetrics.split("\n");
|
||||
|
||||
for (let j = 0; j < masterLines.length; j++) {
|
||||
const line = masterLines[j];
|
||||
|
||||
if (line.startsWith("# HELP ")) {
|
||||
const metricName = line.split(" ")[2];
|
||||
seenMetrics.add(metricName);
|
||||
processedLines.push(line);
|
||||
} else if (line.startsWith("# TYPE ")) {
|
||||
const metricName = line.split(" ")[2];
|
||||
if (seenMetrics.has(metricName)) {
|
||||
processedLines.push(line);
|
||||
}
|
||||
} else if (line.trim() && !line.startsWith("#")) {
|
||||
// Add worker label to each metric line and collect for later
|
||||
const processedLine = line.replace(
|
||||
/^([a-z][a-z0-9_]*)(?:{([^}]*)})?(\s+[0-9.e+-]+.*)/,
|
||||
(match, metricName, existingLabels, valueAndRest) => {
|
||||
if (existingLabels) {
|
||||
return `${metricName}{${existingLabels},worker="master"}${valueAndRest}`;
|
||||
} else {
|
||||
return `${metricName}{worker="master"}${valueAndRest}`;
|
||||
}
|
||||
},
|
||||
);
|
||||
allMetricValues.push(processedLine);
|
||||
}
|
||||
}
|
||||
|
||||
// Collect metrics from all workers
|
||||
for (let i = 0; i < config.numWorkers(); i++) {
|
||||
const workerPort = config.workerPortByIndex(i);
|
||||
const workerUrl = `http://localhost:${workerPort}/metrics`;
|
||||
console.log(`Fetching metrics from worker ${i} at ${workerUrl}`);
|
||||
|
||||
try {
|
||||
const response = await fetch(workerUrl, {
|
||||
headers: {
|
||||
[config.adminHeader()]: config.adminToken(),
|
||||
},
|
||||
});
|
||||
|
||||
if (!response.ok) {
|
||||
console.error(`Worker ${i} returned status ${response.status}`);
|
||||
continue;
|
||||
}
|
||||
|
||||
const metricsText = await response.text();
|
||||
const lines = metricsText.split("\n");
|
||||
|
||||
for (let j = 0; j < lines.length; j++) {
|
||||
const line = lines[j];
|
||||
|
||||
// Collect HELP and TYPE info if we haven't seen this metric before
|
||||
if (line.startsWith("# HELP ")) {
|
||||
const metricName = line.split(" ")[2];
|
||||
if (!seenMetrics.has(metricName)) {
|
||||
seenMetrics.add(metricName);
|
||||
processedLines.push(line);
|
||||
}
|
||||
} else if (line.startsWith("# TYPE ")) {
|
||||
const metricName = line.split(" ")[2];
|
||||
if (
|
||||
seenMetrics.has(metricName) &&
|
||||
!processedLines.some((l) =>
|
||||
l.startsWith(`# TYPE ${metricName}`),
|
||||
)
|
||||
) {
|
||||
processedLines.push(line);
|
||||
}
|
||||
} else if (line.trim() && !line.startsWith("#")) {
|
||||
// Process and collect actual metric values
|
||||
try {
|
||||
const processedLine = line.replace(
|
||||
/^([a-z][a-z0-9_]*)(?:{([^}]*)})?(\s+[0-9.e+-]+.*)/,
|
||||
(match, metricName, existingLabels, valueAndRest) => {
|
||||
if (existingLabels) {
|
||||
return `${metricName}{${existingLabels},worker="worker-${i}"}${valueAndRest}`;
|
||||
} else {
|
||||
return `${metricName}{worker="worker-${i}"}${valueAndRest}`;
|
||||
}
|
||||
},
|
||||
);
|
||||
|
||||
// Make sure the line was actually processed (regex matched)
|
||||
if (processedLine !== line) {
|
||||
allMetricValues.push(processedLine);
|
||||
} else if (
|
||||
line.match(/^[a-z][a-z0-9_]*(?:{[^}]*})?\s+[0-9.e+-]+.*/)
|
||||
) {
|
||||
// This looks like a metric line but didn't match our regex, try a more general approach
|
||||
const parts = line.split(/({|\s+)/);
|
||||
if (parts.length >= 3) {
|
||||
const metricName = parts[0];
|
||||
if (line.includes("{")) {
|
||||
// Has labels
|
||||
const labelEndIndex = line.indexOf("}");
|
||||
const valueStartIndex = labelEndIndex + 1;
|
||||
if (labelEndIndex > 0 && valueStartIndex < line.length) {
|
||||
const labels = line.substring(
|
||||
line.indexOf("{") + 1,
|
||||
labelEndIndex,
|
||||
);
|
||||
const valueAndRest = line.substring(valueStartIndex);
|
||||
allMetricValues.push(
|
||||
`${metricName}{${labels},worker="worker-${i}"}${valueAndRest}`,
|
||||
);
|
||||
}
|
||||
} else {
|
||||
// No labels
|
||||
const valueAndRest = line.substring(metricName.length);
|
||||
allMetricValues.push(
|
||||
`${metricName}{worker="worker-${i}"}${valueAndRest}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error processing metric line: ${line}`, error);
|
||||
// Skip this line if there's an error
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.error(`Error fetching metrics from worker ${i}:`, error);
|
||||
allMetricValues.push(
|
||||
`# Error fetching metrics from worker ${i}: ${error.message}`,
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Combine metadata with all metric values and ensure it ends with a newline
|
||||
const combinedMetrics = [...processedLines, ...allMetricValues].join(
|
||||
"\n",
|
||||
// Configure headers for basic auth if provided
|
||||
const getAuthHeaders = () => {
|
||||
const headers = {};
|
||||
if (config.otelEnabled()) {
|
||||
headers["Authorization"] =
|
||||
"Basic " +
|
||||
Buffer.from(`${config.otelUsername()}:${config.otelPassword()}`).toString(
|
||||
"base64",
|
||||
);
|
||||
}
|
||||
return headers;
|
||||
};
|
||||
|
||||
// Send the combined response with a final newline to prevent unexpected end of input
|
||||
clearTimeout(timeout);
|
||||
res.set("Content-Type", register.contentType);
|
||||
res.end(combinedMetrics + "\n");
|
||||
} catch (error) {
|
||||
console.error("Error collecting metrics:", error);
|
||||
clearTimeout(timeout);
|
||||
res.status(500).end(`# Error collecting metrics: ${error.message}`);
|
||||
}
|
||||
});
|
||||
// Create metrics exporter
|
||||
const metricExporter = new OTLPMetricExporter({
|
||||
// Dummy endpoint if OTEL is not enabled to avoid parsing errors
|
||||
url: `${config.otelEndpoint() || "https://dummy_endpoint.com"}/v1/metrics`,
|
||||
headers: getAuthHeaders(),
|
||||
});
|
||||
|
||||
// Start the metrics server on port 9090
|
||||
const METRICS_PORT = 9090;
|
||||
metricsServer.listen(METRICS_PORT, () => {
|
||||
console.log(`Metrics server listening on port ${METRICS_PORT}`);
|
||||
});
|
||||
}
|
||||
// Configure the metric reader
|
||||
const metricReader = new PeriodicExportingMetricReader({
|
||||
exporter: metricExporter,
|
||||
exportIntervalMillis: 15000, // Export metrics every 15 seconds
|
||||
});
|
||||
|
||||
// Create a meter provider
|
||||
const meterProvider = new MeterProvider({
|
||||
resource,
|
||||
readers: [metricReader],
|
||||
});
|
||||
|
||||
// Setup host metrics
|
||||
const hostMetrics = new HostMetrics({ meterProvider });
|
||||
|
||||
// Get meter for creating custom metrics
|
||||
const meter = meterProvider.getMeter("master-metrics");
|
||||
|
||||
// Export the metrics for use in the master
|
||||
export const setupMasterMetrics = () => {
|
||||
console.log("Starting host metrics collection for master...");
|
||||
|
||||
// Start collecting host metrics
|
||||
hostMetrics.start();
|
||||
|
||||
// Return the meter provider and meter for potential additional metrics
|
||||
return {
|
||||
meterProvider,
|
||||
meter,
|
||||
};
|
||||
};
|
||||
|
||||
@@ -0,0 +1,27 @@
|
||||
import { resourceFromAttributes } from "@opentelemetry/resources";
|
||||
import {
|
||||
ATTR_SERVICE_NAME,
|
||||
ATTR_SERVICE_VERSION,
|
||||
} from "@opentelemetry/semantic-conventions";
|
||||
import { getServerConfigFromServer } from "../core/configuration/ConfigLoader";
|
||||
|
||||
const config = getServerConfigFromServer();
|
||||
|
||||
export function getOtelResource() {
|
||||
return resourceFromAttributes({
|
||||
[ATTR_SERVICE_NAME]: "openfront",
|
||||
[ATTR_SERVICE_VERSION]: "1.0.0",
|
||||
"service.instance.id": process.env.HOSTNAME,
|
||||
"openfront.environment": config.env(),
|
||||
"openfront.host": process.env.HOST,
|
||||
"openfront.domain": process.env.DOMAIN,
|
||||
"openfront.subdomain": process.env.SUBDOMAIN,
|
||||
"openfront.component": process.env.WORKER_ID
|
||||
? "Worker " + process.env.WORKER_ID
|
||||
: "Master",
|
||||
// The comma-separated list tells OpenTelemetry which resource attributes
|
||||
// should be converted to Loki labels
|
||||
"loki.resource.labels":
|
||||
"service.name,service.instance.id,openfront.environment,openfront.host,openfront.domain,openfront.subdomain,openfront.component",
|
||||
});
|
||||
}
|
||||
+6
-22
@@ -33,10 +33,12 @@ export function startWorker() {
|
||||
|
||||
const gm = new GameManager(config, log);
|
||||
|
||||
// Set up periodic metrics updates
|
||||
setInterval(() => {
|
||||
metrics.updateGameMetrics(gm);
|
||||
}, 15000); // Update every 15 seconds
|
||||
if (config.otelEnabled()) {
|
||||
// Set up periodic metrics updates
|
||||
setInterval(() => {
|
||||
metrics.updateGameMetrics(gm);
|
||||
}, 15000); // Update every 15 seconds
|
||||
}
|
||||
|
||||
// Middleware to handle /wX path prefix
|
||||
app.use((req, res, next) => {
|
||||
@@ -251,24 +253,6 @@ export function startWorker() {
|
||||
}),
|
||||
);
|
||||
|
||||
app.get(
|
||||
"/metrics",
|
||||
gatekeeper.httpHandler(LimiterType.Get, async (req, res) => {
|
||||
if (req.headers[config.adminHeader()] !== config.adminToken()) {
|
||||
return res.status(403).end("Access denied");
|
||||
}
|
||||
log.info(`metrics requested on worker ${workerId}`);
|
||||
|
||||
try {
|
||||
const metricsData = await metrics.register.metrics();
|
||||
res.set("Content-Type", metrics.register.contentType);
|
||||
res.end(metricsData);
|
||||
} catch (error) {
|
||||
res.status(500).end(error.message);
|
||||
}
|
||||
}),
|
||||
);
|
||||
|
||||
// WebSocket handling
|
||||
wss.on("connection", (ws: WebSocket, req) => {
|
||||
ws.on(
|
||||
|
||||
+86
-28
@@ -1,45 +1,103 @@
|
||||
import promClient from "prom-client";
|
||||
import { OTLPMetricExporter } from "@opentelemetry/exporter-metrics-otlp-http";
|
||||
import {
|
||||
MeterProvider,
|
||||
PeriodicExportingMetricReader,
|
||||
} from "@opentelemetry/sdk-metrics";
|
||||
import * as dotenv from "dotenv";
|
||||
import { getServerConfigFromServer } from "../core/configuration/ConfigLoader";
|
||||
import { GameManager } from "./GameManager";
|
||||
import { getOtelResource } from "./OtelResource";
|
||||
|
||||
// Initialize the Prometheus registry
|
||||
const register = new promClient.Registry();
|
||||
dotenv.config();
|
||||
|
||||
// Enable default Node.js metrics collection
|
||||
promClient.collectDefaultMetrics({ register });
|
||||
// Get server configuration
|
||||
const config = getServerConfigFromServer();
|
||||
|
||||
// Add worker-specific metrics
|
||||
const activeGamesGauge = new promClient.Gauge({
|
||||
name: "openfront_active_games_count",
|
||||
help: "Number of active games on this worker",
|
||||
registers: [register],
|
||||
// Create resource with worker information
|
||||
const resource = getOtelResource();
|
||||
|
||||
// Configure headers for basic auth if provided
|
||||
const getAuthHeaders = () => {
|
||||
const headers = {};
|
||||
if (config.otelEnabled()) {
|
||||
headers["Authorization"] =
|
||||
"Basic " +
|
||||
Buffer.from(`${config.otelUsername()}:${config.otelPassword()}`).toString(
|
||||
"base64",
|
||||
);
|
||||
}
|
||||
return headers;
|
||||
};
|
||||
|
||||
// Create metrics exporter
|
||||
const metricExporter = new OTLPMetricExporter({
|
||||
// Dummy endpoint if OTEL is not enabled to avoid parsing errors
|
||||
url: `${config.otelEndpoint() || "https://dummy_endpoint.com"}/v1/metrics`,
|
||||
headers: getAuthHeaders(),
|
||||
});
|
||||
|
||||
const connectedClientsGauge = new promClient.Gauge({
|
||||
name: "openfront_connected_clients_count",
|
||||
help: "Number of connected clients on this worker",
|
||||
registers: [register],
|
||||
// Configure the metric reader
|
||||
const metricReader = new PeriodicExportingMetricReader({
|
||||
exporter: metricExporter,
|
||||
exportIntervalMillis: 15000, // Export metrics every 15 seconds
|
||||
});
|
||||
|
||||
const memoryUsageGauge = new promClient.Gauge({
|
||||
name: "openfront_memory_usage_bytes",
|
||||
help: "Current memory usage of the worker process in bytes",
|
||||
registers: [register],
|
||||
// Create a meter provider
|
||||
const meterProvider = new MeterProvider({
|
||||
resource,
|
||||
readers: [metricReader],
|
||||
});
|
||||
|
||||
// Get meter for creating metrics
|
||||
const meter = meterProvider.getMeter("worker-metrics");
|
||||
|
||||
// Create OpenTelemetry metrics
|
||||
const activeGamesCounter = meter.createUpDownCounter(
|
||||
"openfront.active_games.count",
|
||||
{
|
||||
description: "Number of active games on this worker",
|
||||
},
|
||||
);
|
||||
|
||||
const connectedClientsCounter = meter.createUpDownCounter(
|
||||
"openfront.connected_clients.count",
|
||||
{
|
||||
description: "Number of connected clients on this worker",
|
||||
},
|
||||
);
|
||||
|
||||
const memoryUsageObservable = meter.createObservableGauge(
|
||||
"openfront.memory_usage.bytes",
|
||||
{
|
||||
description: "Current memory usage of the worker process in bytes",
|
||||
},
|
||||
);
|
||||
|
||||
// Register callback for the memory usage observable
|
||||
memoryUsageObservable.addCallback((result) => {
|
||||
const memoryUsage = process.memoryUsage();
|
||||
result.observe(memoryUsage.heapUsed);
|
||||
});
|
||||
|
||||
// Totals most recently reported through the UpDownCounters. An
// UpDownCounter accumulates every value passed to add(), so each update
// must add only the change since the previous report, not the absolute
// count; otherwise the exported value grows on every interval.
let reportedActiveGames = 0;
let reportedActiveClients = 0;

// Export the metrics for use in the worker
export const metrics = {
  // Function to update game-related metrics.
  // Reads the current game and client counts from the GameManager and
  // moves each counter by the difference from the last reported value, so
  // the exported series tracks the current totals.
  updateGameMetrics: (gameManager: GameManager) => {
    console.log("Updating game metrics");
    // Get the current counts
    const currentActiveGames = gameManager.activeGames();
    const currentActiveClients = gameManager.activeClients();

    // Apply deltas (add() is relative, not absolute)
    activeGamesCounter.add(currentActiveGames - reportedActiveGames);
    connectedClientsCounter.add(currentActiveClients - reportedActiveClients);
    reportedActiveGames = currentActiveGames;
    reportedActiveClients = currentActiveClients;

    // Memory metrics are automatically collected by the observable
  },

  // Expose the meter provider for potential additional metrics
  meterProvider,

  // Expose the meter for creating additional metrics
  meter,
};
|
||||
|
||||
+1
-16
@@ -29,19 +29,4 @@ command=cloudflared tunnel run --token %(ENV_CLOUDFLARE_TUNNEL_TOKEN)s
|
||||
autostart=true
|
||||
autorestart=true
|
||||
stdout_logfile=/var/log/cloudflared.log
|
||||
stderr_logfile=/var/log/cloudflared-err.log
|
||||
|
||||
[program:node_exporter]
|
||||
command=/usr/local/bin/node_exporter
|
||||
autostart=true
|
||||
autorestart=true
|
||||
stdout_logfile=/var/log/node_exporter.log
|
||||
stderr_logfile=/var/log/node_exporter-err.log
|
||||
|
||||
|
||||
[program:metrics_exporter]
|
||||
command=/usr/src/app/metric-exporter.sh
|
||||
autostart=true
|
||||
autorestart=true
|
||||
stdout_logfile=/var/log/metrics-exporter.log
|
||||
stderr_logfile=/var/log/metrics-exporter-err.log
|
||||
stderr_logfile=/var/log/cloudflared-err.log
|
||||
@@ -3,6 +3,18 @@ import { GameMapType } from "../../src/core/game/Game";
|
||||
import { GameID } from "../../src/core/Schemas";
|
||||
|
||||
export class TestServerConfig implements ServerConfig {
|
||||
otelEnabled(): boolean {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
otelEndpoint(): string {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
otelUsername(): string {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
otelPassword(): string {
|
||||
throw new Error("Method not implemented.");
|
||||
}
|
||||
region(): string {
|
||||
return "test";
|
||||
}
|
||||
|
||||
@@ -16,19 +16,6 @@ echo "======================================================"
|
||||
# Container and image configuration
|
||||
CONTAINER_NAME="openfront-${ENV}-${SUBDOMAIN}"
|
||||
|
||||
# Install Loki Docker plugin if not already installed
|
||||
if ! docker plugin ls | grep -q "loki"; then
|
||||
echo "Installing Loki Docker plugin..."
|
||||
docker plugin install grafana/loki-docker-driver:latest --alias loki --grant-all-permissions
|
||||
if [ $? -ne 0 ]; then
|
||||
echo "Failed to install Loki Docker plugin. Continuing anyway..."
|
||||
else
|
||||
echo "Loki Docker plugin installed successfully."
|
||||
fi
|
||||
else
|
||||
echo "Loki Docker plugin already installed."
|
||||
fi
|
||||
|
||||
echo "Pulling ${DOCKER_IMAGE} from Docker Hub..."
|
||||
docker pull $DOCKER_IMAGE
|
||||
|
||||
@@ -52,31 +39,10 @@ if [ -n "$STOPPED_CONTAINER" ]; then
|
||||
echo "Container $STOPPED_CONTAINER removed."
|
||||
fi
|
||||
|
||||
# Check if port 80 is still in use
|
||||
echo "Checking if port 80 is still in use..."
|
||||
if command -v lsof >/dev/null 2>&1; then
|
||||
PORT_CHECK=$(lsof -i :80 | grep LISTEN)
|
||||
elif command -v netstat >/dev/null 2>&1; then
|
||||
PORT_CHECK=$(netstat -tuln | grep ":80 ")
|
||||
else
|
||||
PORT_CHECK=""
|
||||
echo "Warning: Cannot check if port is in use (neither lsof nor netstat found)"
|
||||
fi
|
||||
|
||||
if [ -n "$PORT_CHECK" ]; then
|
||||
echo "Warning: Port 80 is still in use by another process:"
|
||||
echo "$PORT_CHECK"
|
||||
echo "Attempting to proceed anyway..."
|
||||
fi
|
||||
|
||||
echo "Starting new container for ${HOST} environment..."
|
||||
docker run -d \
|
||||
--restart=always \
|
||||
$VOLUME_MOUNTS \
|
||||
--log-driver=loki \
|
||||
--log-opt loki-url="https://${MON_USERNAME}:${MON_PASSWORD}@mon.openfront.io/loki/loki/api/v1/push" \
|
||||
--log-opt loki-batch-size="400" \
|
||||
--log-opt loki-external-labels="job=docker,environment=${ENV},host=${HOST},subdomain=${SUBDOMAIN}" \
|
||||
--env-file /home/openfront/.env \
|
||||
--name ${CONTAINER_NAME} \
|
||||
$DOCKER_IMAGE
|
||||
@@ -99,5 +65,4 @@ echo "======================================================"
|
||||
echo "✅ SERVER UPDATE COMPLETED SUCCESSFULLY"
|
||||
echo "Container name: ${CONTAINER_NAME}"
|
||||
echo "Image: ${FULL_IMAGE_NAME}"
|
||||
echo "Logs: Configured to send to Loki on port 3100"
|
||||
echo "======================================================"
|
||||
Reference in New Issue
Block a user