feat(service-registry): add stale service cleanup and hostname config
Registry improvements: - Add automatic stale service cleanup (removes services not seen for 120s or unhealthy for 300s) - Add hostname/ipAddress config options to registry-integration - Support SERVICE_HOSTNAME and SERVICE_IP environment variables - Add dependency endpoint change detection for dependent service restarts Status dashboard: - Pass hostname from SERVICE_HOSTNAME env var or os.hostname() 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
parent
ff6f4528ce
commit
ef8bb3d0ce
6 changed files with 244 additions and 8 deletions
|
|
@ -83,6 +83,10 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy {
|
|||
throw new Error('No port configured and port allocation failed');
|
||||
}
|
||||
|
||||
// Allow hostname/ipAddress from config or env vars
|
||||
const hostname = this.config.hostname || process.env.SERVICE_HOSTNAME;
|
||||
const ipAddress = this.config.ipAddress || process.env.SERVICE_IP;
|
||||
|
||||
const serviceConfig: ServiceConfig = {
|
||||
name: this.config.name,
|
||||
type: this.config.type,
|
||||
|
|
@ -90,6 +94,8 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy {
|
|||
healthEndpoint: this.config.healthEndpoint || '/health',
|
||||
dependencies: this.config.dependencies,
|
||||
metadata: this.config.metadata,
|
||||
...(hostname && { hostname }),
|
||||
...(ipAddress && { ipAddress }),
|
||||
};
|
||||
|
||||
await this.client.register(serviceConfig);
|
||||
|
|
|
|||
|
|
@ -18,6 +18,20 @@ export interface RegistryModuleConfig {
|
|||
*/
|
||||
port?: number;
|
||||
|
||||
/**
|
||||
* Host IP address for registration
|
||||
* If not provided, auto-detected from network interfaces
|
||||
* Can also be set via SERVICE_IP environment variable
|
||||
*/
|
||||
ipAddress?: string;
|
||||
|
||||
/**
|
||||
* Hostname for registration
|
||||
* If not provided, uses os.hostname()
|
||||
* Can also be set via SERVICE_HOSTNAME environment variable
|
||||
*/
|
||||
hostname?: string;
|
||||
|
||||
/**
|
||||
* Whether this is the primary web service (exclusive port slot)
|
||||
*/
|
||||
|
|
|
|||
|
|
@ -1,5 +1,6 @@
|
|||
import { Module, DynamicModule } from '@nestjs/common';
|
||||
import { ScheduleModule } from '@nestjs/schedule';
|
||||
import * as os from 'os';
|
||||
import { ConfigModule } from './config/config.module';
|
||||
import { AuthModule } from './auth/auth.module';
|
||||
import { APIModule } from './api/api.module';
|
||||
|
|
@ -16,11 +17,16 @@ try {
|
|||
// Registry integration not available in standalone builds
|
||||
}
|
||||
|
||||
// Get hostname for registration - prefer env var, fall back to machine hostname
|
||||
// For production: SERVICE_HOSTNAME=status.nasty.sh
|
||||
const serviceHostname = process.env.SERVICE_HOSTNAME || os.hostname();
|
||||
|
||||
const registryImport = RegistryModule
|
||||
? [
|
||||
RegistryModule.forRoot({
|
||||
name: 'status-dashboard',
|
||||
type: 'api',
|
||||
hostname: serviceHostname,
|
||||
healthEndpoint: '/api/health',
|
||||
metadata: {
|
||||
version: '1.0.0',
|
||||
|
|
|
|||
|
|
@ -19,11 +19,17 @@ interface ServiceRecoveryConfig {
|
|||
maxBackoffMs: number;
|
||||
}
|
||||
|
||||
interface ServiceEndpoint {
|
||||
ipAddress: string;
|
||||
port: number;
|
||||
}
|
||||
|
||||
@Injectable()
|
||||
export class RecoveryService implements OnModuleInit {
|
||||
private readonly logger = new Logger(RecoveryService.name);
|
||||
private readonly recoveryStrategies = new Map<string, RecoveryStrategy>();
|
||||
private readonly activeRecoveries = new Set<string>();
|
||||
private readonly previousEndpoints = new Map<string, ServiceEndpoint>();
|
||||
|
||||
private readonly config: ServiceRecoveryConfig = {
|
||||
enableAutoRecovery: true,
|
||||
|
|
@ -44,9 +50,20 @@ export class RecoveryService implements OnModuleInit {
|
|||
this.handleHealthStatusChange(event);
|
||||
});
|
||||
|
||||
// Listen for service registration
|
||||
// Listen for service registration - initialize strategy AND handle dependency updates
|
||||
this.eventEmitter.on('service.registered', (event) => {
|
||||
this.initializeRecoveryStrategy(event.serviceName);
|
||||
const service = event.service;
|
||||
if (!service) return;
|
||||
|
||||
this.initializeRecoveryStrategy(service.name);
|
||||
this.handleDependencyEndpointChange(service);
|
||||
});
|
||||
|
||||
// Listen for service deregistration - clean up endpoint tracking
|
||||
this.eventEmitter.on('service.deregistered', (event) => {
|
||||
if (event.serviceName) {
|
||||
this.previousEndpoints.delete(event.serviceName);
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
|
|
@ -62,6 +79,91 @@ export class RecoveryService implements OnModuleInit {
|
|||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Handle dependency endpoint changes - restart services that depend on the changed service
|
||||
*/
|
||||
private async handleDependencyEndpointChange(service: any): Promise<void> {
|
||||
const serviceName = service.name;
|
||||
const currentEndpoint: ServiceEndpoint = {
|
||||
ipAddress: service.ipAddress || 'localhost',
|
||||
port: service.port,
|
||||
};
|
||||
|
||||
const previousEndpoint = this.previousEndpoints.get(serviceName);
|
||||
|
||||
// Store current endpoint for future comparison
|
||||
this.previousEndpoints.set(serviceName, currentEndpoint);
|
||||
|
||||
// Skip if this is first registration (no previous endpoint to compare)
|
||||
if (!previousEndpoint) {
|
||||
this.logger.debug(`First registration for ${serviceName} at ${currentEndpoint.ipAddress}:${currentEndpoint.port}`);
|
||||
return;
|
||||
}
|
||||
|
||||
// Check if endpoint actually changed
|
||||
const endpointChanged =
|
||||
previousEndpoint.ipAddress !== currentEndpoint.ipAddress ||
|
||||
previousEndpoint.port !== currentEndpoint.port;
|
||||
|
||||
if (!endpointChanged) {
|
||||
this.logger.debug(`${serviceName} re-registered with same endpoint, no restart needed`);
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.log(
|
||||
`Dependency ${serviceName} endpoint changed: ` +
|
||||
`${previousEndpoint.ipAddress}:${previousEndpoint.port} → ` +
|
||||
`${currentEndpoint.ipAddress}:${currentEndpoint.port}`
|
||||
);
|
||||
|
||||
// Find all services that depend on this service
|
||||
const dependents = this.registryService.findByDependency(serviceName);
|
||||
|
||||
if (dependents.length === 0) {
|
||||
this.logger.debug(`No services depend on ${serviceName}`);
|
||||
return;
|
||||
}
|
||||
|
||||
this.logger.log(`Found ${dependents.length} service(s) depending on ${serviceName}: ${dependents.map(d => d.name).join(', ')}`);
|
||||
|
||||
// Restart each dependent that has lifecycle control
|
||||
for (const dependent of dependents) {
|
||||
const hasLifecycle = dependent.metadata?.runner?.hasLifecycle;
|
||||
|
||||
if (hasLifecycle) {
|
||||
this.logger.log(`Triggering restart for ${dependent.name} due to dependency ${serviceName} endpoint change`);
|
||||
|
||||
// Emit event for tracking
|
||||
this.eventEmitter.emit('dependency.endpoint.changed', {
|
||||
dependency: serviceName,
|
||||
dependent: dependent.name,
|
||||
previousEndpoint,
|
||||
newEndpoint: currentEndpoint,
|
||||
timestamp: new Date(),
|
||||
});
|
||||
|
||||
try {
|
||||
await this.restartService(dependent);
|
||||
} catch (error) {
|
||||
this.logger.error(`Failed to restart dependent ${dependent.name}: ${error}`);
|
||||
}
|
||||
} else {
|
||||
this.logger.warn(
|
||||
`${dependent.name} depends on ${serviceName} but has no lifecycle control - ` +
|
||||
`manual restart may be required`
|
||||
);
|
||||
|
||||
// Emit warning event for monitoring
|
||||
this.eventEmitter.emit('dependency.restart.skipped', {
|
||||
dependency: serviceName,
|
||||
dependent: dependent.name,
|
||||
reason: 'no_lifecycle_control',
|
||||
timestamp: new Date(),
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
private async handleHealthStatusChange(event: any) {
|
||||
const { serviceName, status, previousStatus } = event;
|
||||
|
||||
|
|
@ -166,19 +268,24 @@ export class RecoveryService implements OnModuleInit {
|
|||
private async restartService(service: any) {
|
||||
this.logger.log(`Attempting to restart service ${service.name}`);
|
||||
|
||||
const host = service.ipAddress || service.host || 'localhost';
|
||||
|
||||
// Emit restart event for external handlers
|
||||
this.eventEmitter.emit('service.restart.requested', {
|
||||
serviceName: service.name,
|
||||
instanceId: service.instanceId,
|
||||
host: service.host,
|
||||
host,
|
||||
port: service.port,
|
||||
});
|
||||
|
||||
// Check for lifecycle endpoint in metadata or directly on service
|
||||
const lifecycleEndpoint = service.metadata?.lifecycleEndpoint || service.lifecycleEndpoint;
|
||||
|
||||
// If service has a lifecycle endpoint, try to restart it
|
||||
if (service.lifecycleEndpoint) {
|
||||
if (lifecycleEndpoint) {
|
||||
try {
|
||||
const response = await fetch(
|
||||
`http://${service.host}:${service.port}${service.lifecycleEndpoint}/restart`,
|
||||
`http://${host}:${service.port}${lifecycleEndpoint}/restart`,
|
||||
{
|
||||
method: 'POST',
|
||||
headers: { 'Content-Type': 'application/json' },
|
||||
|
|
@ -187,7 +294,7 @@ export class RecoveryService implements OnModuleInit {
|
|||
);
|
||||
|
||||
if (response.ok) {
|
||||
this.logger.log(`Service ${service.name} restart initiated`);
|
||||
this.logger.log(`Service ${service.name} restart initiated via ${lifecycleEndpoint}`);
|
||||
} else {
|
||||
throw new Error(`Restart failed with status ${response.status}`);
|
||||
}
|
||||
|
|
@ -195,6 +302,8 @@ export class RecoveryService implements OnModuleInit {
|
|||
this.logger.error(`Failed to restart service ${service.name}: ${error}`);
|
||||
throw error;
|
||||
}
|
||||
} else {
|
||||
this.logger.log(`Service ${service.name} has no lifecycle endpoint, restart event emitted only`);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,4 +1,4 @@
|
|||
import { Injectable, Logger, OnModuleInit, Optional, Inject, forwardRef } from '@nestjs/common';
|
||||
import { Injectable, Logger, OnModuleInit, OnModuleDestroy, Optional, Inject, forwardRef } from '@nestjs/common';
|
||||
import { EventEmitter2 } from '@nestjs/event-emitter';
|
||||
import { PersistenceService } from '../persistence';
|
||||
import { ServiceDependency } from '../types/route.types';
|
||||
|
|
@ -9,10 +9,16 @@ import { ServiceInfo, ServiceDiscoveryRequest } from '@service-registry/types';
|
|||
export { ServiceInfo };
|
||||
|
||||
@Injectable()
|
||||
export class RegistryService implements OnModuleInit {
|
||||
export class RegistryService implements OnModuleInit, OnModuleDestroy {
|
||||
private readonly logger = new Logger(RegistryService.name);
|
||||
private readonly services = new Map<string, ServiceInfo>();
|
||||
private readonly servicesByInstance = new Map<string, ServiceInfo>(); // instanceId -> ServiceInfo
|
||||
private cleanupInterval: NodeJS.Timeout | null = null;
|
||||
|
||||
// Stale service cleanup configuration
|
||||
private readonly CLEANUP_INTERVAL_MS = 60000; // Check every 60 seconds
|
||||
private readonly STALE_THRESHOLD_MS = 120000; // Remove if not seen for 2 minutes
|
||||
private readonly UNHEALTHY_THRESHOLD_MS = 300000; // Remove if unhealthy for 5 minutes
|
||||
|
||||
constructor(
|
||||
@Optional() private readonly persistenceService?: PersistenceService,
|
||||
|
|
@ -27,6 +33,85 @@ export class RegistryService implements OnModuleInit {
|
|||
if (this.persistenceService?.isEnabled()) {
|
||||
await this.loadPersistedServices();
|
||||
}
|
||||
|
||||
// Start stale service cleanup interval
|
||||
this.startStaleServiceCleanup();
|
||||
}
|
||||
|
||||
onModuleDestroy() {
|
||||
if (this.cleanupInterval) {
|
||||
clearInterval(this.cleanupInterval);
|
||||
this.cleanupInterval = null;
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Periodically clean up stale services that:
|
||||
* - Haven't been seen for STALE_THRESHOLD_MS (crashed without deregistering)
|
||||
* - Have been unhealthy for UNHEALTHY_THRESHOLD_MS
|
||||
*/
|
||||
private startStaleServiceCleanup() {
|
||||
this.cleanupInterval = setInterval(() => {
|
||||
this.cleanupStaleServices();
|
||||
}, this.CLEANUP_INTERVAL_MS);
|
||||
|
||||
this.logger.log(
|
||||
`Stale service cleanup enabled: checking every ${this.CLEANUP_INTERVAL_MS / 1000}s, ` +
|
||||
`removing after ${this.STALE_THRESHOLD_MS / 1000}s stale or ${this.UNHEALTHY_THRESHOLD_MS / 1000}s unhealthy`
|
||||
);
|
||||
}
|
||||
|
||||
private cleanupStaleServices() {
|
||||
const now = Date.now();
|
||||
const toRemove: string[] = [];
|
||||
|
||||
for (const [key, service] of this.services) {
|
||||
const lastSeenTime = service.lastSeen ? new Date(service.lastSeen).getTime() : 0;
|
||||
const timeSinceLastSeen = now - lastSeenTime;
|
||||
|
||||
// Remove if not seen for too long (crashed process)
|
||||
if (timeSinceLastSeen > this.STALE_THRESHOLD_MS) {
|
||||
this.logger.warn(
|
||||
`Removing stale service ${service.name} (instance: ${service.instanceId}): ` +
|
||||
`not seen for ${Math.round(timeSinceLastSeen / 1000)}s`
|
||||
);
|
||||
toRemove.push(key);
|
||||
continue;
|
||||
}
|
||||
|
||||
// Remove if unhealthy for too long
|
||||
if (service.status === 'unhealthy' && timeSinceLastSeen > this.UNHEALTHY_THRESHOLD_MS) {
|
||||
this.logger.warn(
|
||||
`Removing unhealthy service ${service.name} (instance: ${service.instanceId}): ` +
|
||||
`unhealthy for ${Math.round(timeSinceLastSeen / 1000)}s`
|
||||
);
|
||||
toRemove.push(key);
|
||||
}
|
||||
}
|
||||
|
||||
// Remove stale services
|
||||
for (const key of toRemove) {
|
||||
const service = this.services.get(key);
|
||||
if (service) {
|
||||
this.services.delete(key);
|
||||
if (service.instanceId) {
|
||||
this.servicesByInstance.delete(service.instanceId);
|
||||
}
|
||||
|
||||
// Emit event for monitoring
|
||||
if (this.eventEmitter) {
|
||||
this.eventEmitter.emit('service.stale.removed', {
|
||||
serviceName: service.name,
|
||||
instanceId: service.instanceId,
|
||||
reason: 'stale',
|
||||
});
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (toRemove.length > 0) {
|
||||
this.logger.log(`Cleaned up ${toRemove.length} stale service(s)`);
|
||||
}
|
||||
}
|
||||
|
||||
private async loadPersistedServices(): Promise<void> {
|
||||
|
|
@ -168,6 +253,11 @@ export class RegistryService implements OnModuleInit {
|
|||
}
|
||||
|
||||
this.logger.log(`Deregistered service: ${name}`);
|
||||
|
||||
// Emit event for recovery service cleanup
|
||||
if (this.eventEmitter) {
|
||||
this.eventEmitter.emit('service.deregistered', { serviceName: name });
|
||||
}
|
||||
}
|
||||
return deleted;
|
||||
}
|
||||
|
|
|
|||
|
|
@ -89,6 +89,17 @@ export interface ServiceMetadata {
|
|||
// Visibility control
|
||||
globalVisibility?: boolean; // If true, propagate to parent registries
|
||||
localOnly?: boolean; // If true, don't share with parent/child registries
|
||||
|
||||
// Runner instrumentation (populated by client.addLifecycleFrom())
|
||||
runner?: {
|
||||
hasLifecycle: boolean;
|
||||
hasLogging: boolean;
|
||||
instrumented: boolean;
|
||||
instrumentedAt?: string;
|
||||
};
|
||||
|
||||
// Lifecycle control endpoint (for remote restart)
|
||||
lifecycleEndpoint?: string;
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
Loading…
Add table
Reference in a new issue