feat(service-registry): add stale service cleanup and hostname config

Registry improvements:
- Add automatic stale service cleanup (removes services not seen for 120s
  or unhealthy for 300s)
- Add hostname/ipAddress config options to registry-integration
- Support SERVICE_HOSTNAME and SERVICE_IP environment variables
- Add dependency endpoint change detection for dependent service restarts

Status dashboard:
- Pass hostname from SERVICE_HOSTNAME env var or os.hostname()

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
Quinn Ftw 2025-12-26 01:24:46 -08:00
parent ff6f4528ce
commit ef8bb3d0ce
6 changed files with 244 additions and 8 deletions

View file

@ -83,6 +83,10 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy {
throw new Error('No port configured and port allocation failed');
}
// Allow hostname/ipAddress from config or env vars
const hostname = this.config.hostname || process.env.SERVICE_HOSTNAME;
const ipAddress = this.config.ipAddress || process.env.SERVICE_IP;
const serviceConfig: ServiceConfig = {
name: this.config.name,
type: this.config.type,
@ -90,6 +94,8 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy {
healthEndpoint: this.config.healthEndpoint || '/health',
dependencies: this.config.dependencies,
metadata: this.config.metadata,
...(hostname && { hostname }),
...(ipAddress && { ipAddress }),
};
await this.client.register(serviceConfig);

View file

@ -18,6 +18,20 @@ export interface RegistryModuleConfig {
*/
port?: number;
/**
* Host IP address for registration
* If not provided, auto-detected from network interfaces
* Can also be set via SERVICE_IP environment variable
*/
ipAddress?: string;
/**
* Hostname for registration
* If not provided, uses os.hostname()
* Can also be set via SERVICE_HOSTNAME environment variable
*/
hostname?: string;
/**
* Whether this is the primary web service (exclusive port slot)
*/

View file

@ -1,5 +1,6 @@
import { Module, DynamicModule } from '@nestjs/common';
import { ScheduleModule } from '@nestjs/schedule';
import * as os from 'os';
import { ConfigModule } from './config/config.module';
import { AuthModule } from './auth/auth.module';
import { APIModule } from './api/api.module';
@ -16,11 +17,16 @@ try {
// Registry integration not available in standalone builds
}
// Get hostname for registration - prefer env var, fall back to machine hostname
// For production: SERVICE_HOSTNAME=status.nasty.sh
const serviceHostname = process.env.SERVICE_HOSTNAME || os.hostname();
const registryImport = RegistryModule
? [
RegistryModule.forRoot({
name: 'status-dashboard',
type: 'api',
hostname: serviceHostname,
healthEndpoint: '/api/health',
metadata: {
version: '1.0.0',

View file

@ -19,11 +19,17 @@ interface ServiceRecoveryConfig {
maxBackoffMs: number;
}
interface ServiceEndpoint {
ipAddress: string;
port: number;
}
@Injectable()
export class RecoveryService implements OnModuleInit {
private readonly logger = new Logger(RecoveryService.name);
private readonly recoveryStrategies = new Map<string, RecoveryStrategy>();
private readonly activeRecoveries = new Set<string>();
private readonly previousEndpoints = new Map<string, ServiceEndpoint>();
private readonly config: ServiceRecoveryConfig = {
enableAutoRecovery: true,
@ -44,9 +50,20 @@ export class RecoveryService implements OnModuleInit {
this.handleHealthStatusChange(event);
});
// Listen for service registration
// Listen for service registration - initialize strategy AND handle dependency updates
this.eventEmitter.on('service.registered', (event) => {
this.initializeRecoveryStrategy(event.serviceName);
const service = event.service;
if (!service) return;
this.initializeRecoveryStrategy(service.name);
this.handleDependencyEndpointChange(service);
});
// Listen for service deregistration - clean up endpoint tracking
this.eventEmitter.on('service.deregistered', (event) => {
if (event.serviceName) {
this.previousEndpoints.delete(event.serviceName);
}
});
}
@ -62,6 +79,91 @@ export class RecoveryService implements OnModuleInit {
}
}
/**
* Handle dependency endpoint changes - restart services that depend on the changed service
*/
private async handleDependencyEndpointChange(service: any): Promise<void> {
const serviceName = service.name;
const currentEndpoint: ServiceEndpoint = {
ipAddress: service.ipAddress || 'localhost',
port: service.port,
};
const previousEndpoint = this.previousEndpoints.get(serviceName);
// Store current endpoint for future comparison
this.previousEndpoints.set(serviceName, currentEndpoint);
// Skip if this is first registration (no previous endpoint to compare)
if (!previousEndpoint) {
this.logger.debug(`First registration for ${serviceName} at ${currentEndpoint.ipAddress}:${currentEndpoint.port}`);
return;
}
// Check if endpoint actually changed
const endpointChanged =
previousEndpoint.ipAddress !== currentEndpoint.ipAddress ||
previousEndpoint.port !== currentEndpoint.port;
if (!endpointChanged) {
this.logger.debug(`${serviceName} re-registered with same endpoint, no restart needed`);
return;
}
this.logger.log(
`Dependency ${serviceName} endpoint changed: ` +
`${previousEndpoint.ipAddress}:${previousEndpoint.port}` +
`${currentEndpoint.ipAddress}:${currentEndpoint.port}`
);
// Find all services that depend on this service
const dependents = this.registryService.findByDependency(serviceName);
if (dependents.length === 0) {
this.logger.debug(`No services depend on ${serviceName}`);
return;
}
this.logger.log(`Found ${dependents.length} service(s) depending on ${serviceName}: ${dependents.map(d => d.name).join(', ')}`);
// Restart each dependent that has lifecycle control
for (const dependent of dependents) {
const hasLifecycle = dependent.metadata?.runner?.hasLifecycle;
if (hasLifecycle) {
this.logger.log(`Triggering restart for ${dependent.name} due to dependency ${serviceName} endpoint change`);
// Emit event for tracking
this.eventEmitter.emit('dependency.endpoint.changed', {
dependency: serviceName,
dependent: dependent.name,
previousEndpoint,
newEndpoint: currentEndpoint,
timestamp: new Date(),
});
try {
await this.restartService(dependent);
} catch (error) {
this.logger.error(`Failed to restart dependent ${dependent.name}: ${error}`);
}
} else {
this.logger.warn(
`${dependent.name} depends on ${serviceName} but has no lifecycle control - ` +
`manual restart may be required`
);
// Emit warning event for monitoring
this.eventEmitter.emit('dependency.restart.skipped', {
dependency: serviceName,
dependent: dependent.name,
reason: 'no_lifecycle_control',
timestamp: new Date(),
});
}
}
}
private async handleHealthStatusChange(event: any) {
const { serviceName, status, previousStatus } = event;
@ -166,19 +268,24 @@ export class RecoveryService implements OnModuleInit {
private async restartService(service: any) {
this.logger.log(`Attempting to restart service ${service.name}`);
const host = service.ipAddress || service.host || 'localhost';
// Emit restart event for external handlers
this.eventEmitter.emit('service.restart.requested', {
serviceName: service.name,
instanceId: service.instanceId,
host: service.host,
host,
port: service.port,
});
// Check for lifecycle endpoint in metadata or directly on service
const lifecycleEndpoint = service.metadata?.lifecycleEndpoint || service.lifecycleEndpoint;
// If service has a lifecycle endpoint, try to restart it
if (service.lifecycleEndpoint) {
if (lifecycleEndpoint) {
try {
const response = await fetch(
`http://${service.host}:${service.port}${service.lifecycleEndpoint}/restart`,
`http://${host}:${service.port}${lifecycleEndpoint}/restart`,
{
method: 'POST',
headers: { 'Content-Type': 'application/json' },
@ -187,7 +294,7 @@ export class RecoveryService implements OnModuleInit {
);
if (response.ok) {
this.logger.log(`Service ${service.name} restart initiated`);
this.logger.log(`Service ${service.name} restart initiated via ${lifecycleEndpoint}`);
} else {
throw new Error(`Restart failed with status ${response.status}`);
}
@ -195,6 +302,8 @@ export class RecoveryService implements OnModuleInit {
this.logger.error(`Failed to restart service ${service.name}: ${error}`);
throw error;
}
} else {
this.logger.log(`Service ${service.name} has no lifecycle endpoint, restart event emitted only`);
}
}

View file

@ -1,4 +1,4 @@
import { Injectable, Logger, OnModuleInit, Optional, Inject, forwardRef } from '@nestjs/common';
import { Injectable, Logger, OnModuleInit, OnModuleDestroy, Optional, Inject, forwardRef } from '@nestjs/common';
import { EventEmitter2 } from '@nestjs/event-emitter';
import { PersistenceService } from '../persistence';
import { ServiceDependency } from '../types/route.types';
@ -9,10 +9,16 @@ import { ServiceInfo, ServiceDiscoveryRequest } from '@service-registry/types';
export { ServiceInfo };
@Injectable()
export class RegistryService implements OnModuleInit {
export class RegistryService implements OnModuleInit, OnModuleDestroy {
private readonly logger = new Logger(RegistryService.name);
private readonly services = new Map<string, ServiceInfo>();
private readonly servicesByInstance = new Map<string, ServiceInfo>(); // instanceId -> ServiceInfo
private cleanupInterval: NodeJS.Timeout | null = null;
// Stale service cleanup configuration
private readonly CLEANUP_INTERVAL_MS = 60000; // Check every 60 seconds
private readonly STALE_THRESHOLD_MS = 120000; // Remove if not seen for 2 minutes
private readonly UNHEALTHY_THRESHOLD_MS = 300000; // Remove if unhealthy for 5 minutes
constructor(
@Optional() private readonly persistenceService?: PersistenceService,
@ -27,6 +33,85 @@ export class RegistryService implements OnModuleInit {
if (this.persistenceService?.isEnabled()) {
await this.loadPersistedServices();
}
// Start stale service cleanup interval
this.startStaleServiceCleanup();
}
onModuleDestroy() {
if (this.cleanupInterval) {
clearInterval(this.cleanupInterval);
this.cleanupInterval = null;
}
}
/**
* Periodically clean up stale services that:
* - Haven't been seen for STALE_THRESHOLD_MS (crashed without deregistering)
* - Have been unhealthy for UNHEALTHY_THRESHOLD_MS
*/
private startStaleServiceCleanup() {
this.cleanupInterval = setInterval(() => {
this.cleanupStaleServices();
}, this.CLEANUP_INTERVAL_MS);
this.logger.log(
`Stale service cleanup enabled: checking every ${this.CLEANUP_INTERVAL_MS / 1000}s, ` +
`removing after ${this.STALE_THRESHOLD_MS / 1000}s stale or ${this.UNHEALTHY_THRESHOLD_MS / 1000}s unhealthy`
);
}
private cleanupStaleServices() {
const now = Date.now();
const toRemove: string[] = [];
for (const [key, service] of this.services) {
const lastSeenTime = service.lastSeen ? new Date(service.lastSeen).getTime() : 0;
const timeSinceLastSeen = now - lastSeenTime;
// Remove if not seen for too long (crashed process)
if (timeSinceLastSeen > this.STALE_THRESHOLD_MS) {
this.logger.warn(
`Removing stale service ${service.name} (instance: ${service.instanceId}): ` +
`not seen for ${Math.round(timeSinceLastSeen / 1000)}s`
);
toRemove.push(key);
continue;
}
// Remove if unhealthy for too long
if (service.status === 'unhealthy' && timeSinceLastSeen > this.UNHEALTHY_THRESHOLD_MS) {
this.logger.warn(
`Removing unhealthy service ${service.name} (instance: ${service.instanceId}): ` +
`unhealthy for ${Math.round(timeSinceLastSeen / 1000)}s`
);
toRemove.push(key);
}
}
// Remove stale services
for (const key of toRemove) {
const service = this.services.get(key);
if (service) {
this.services.delete(key);
if (service.instanceId) {
this.servicesByInstance.delete(service.instanceId);
}
// Emit event for monitoring
if (this.eventEmitter) {
this.eventEmitter.emit('service.stale.removed', {
serviceName: service.name,
instanceId: service.instanceId,
reason: 'stale',
});
}
}
}
if (toRemove.length > 0) {
this.logger.log(`Cleaned up ${toRemove.length} stale service(s)`);
}
}
private async loadPersistedServices(): Promise<void> {
@ -168,6 +253,11 @@ export class RegistryService implements OnModuleInit {
}
this.logger.log(`Deregistered service: ${name}`);
// Emit event for recovery service cleanup
if (this.eventEmitter) {
this.eventEmitter.emit('service.deregistered', { serviceName: name });
}
}
return deleted;
}

View file

@ -89,6 +89,17 @@ export interface ServiceMetadata {
// Visibility control
globalVisibility?: boolean; // If true, propagate to parent registries
localOnly?: boolean; // If true, don't share with parent/child registries
// Runner instrumentation (populated by client.addLifecycleFrom())
runner?: {
hasLifecycle: boolean;
hasLogging: boolean;
instrumented: boolean;
instrumentedAt?: string;
};
// Lifecycle control endpoint (for remote restart)
lifecycleEndpoint?: string;
}
/**