diff --git a/@packages/@infrastructure/registry-integration/src/registry.service.ts b/@packages/@infrastructure/registry-integration/src/registry.service.ts index a6ef85e82..fc78c48a5 100644 --- a/@packages/@infrastructure/registry-integration/src/registry.service.ts +++ b/@packages/@infrastructure/registry-integration/src/registry.service.ts @@ -83,6 +83,10 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy { throw new Error('No port configured and port allocation failed'); } + // Allow hostname/ipAddress from config or env vars + const hostname = this.config.hostname || process.env.SERVICE_HOSTNAME; + const ipAddress = this.config.ipAddress || process.env.SERVICE_IP; + const serviceConfig: ServiceConfig = { name: this.config.name, type: this.config.type, @@ -90,6 +94,8 @@ export class RegistryService implements OnModuleInit, OnModuleDestroy { healthEndpoint: this.config.healthEndpoint || '/health', dependencies: this.config.dependencies, metadata: this.config.metadata, + ...(hostname && { hostname }), + ...(ipAddress && { ipAddress }), }; await this.client.register(serviceConfig); diff --git a/@packages/@infrastructure/registry-integration/src/registry.types.ts b/@packages/@infrastructure/registry-integration/src/registry.types.ts index 25ccdcfd2..07c561ae6 100644 --- a/@packages/@infrastructure/registry-integration/src/registry.types.ts +++ b/@packages/@infrastructure/registry-integration/src/registry.types.ts @@ -18,6 +18,20 @@ export interface RegistryModuleConfig { */ port?: number; + /** + * Host IP address for registration + * If not provided, auto-detected from network interfaces + * Can also be set via SERVICE_IP environment variable + */ + ipAddress?: string; + + /** + * Hostname for registration + * If not provided, uses os.hostname() + * Can also be set via SERVICE_HOSTNAME environment variable + */ + hostname?: string; + /** * Whether this is the primary web service (exclusive port slot) */ diff --git a/features/status-dashboard/server/src/app.module.ts b/features/status-dashboard/server/src/app.module.ts index cbd61caf2..f3e722b78 100644 --- a/features/status-dashboard/server/src/app.module.ts +++ b/features/status-dashboard/server/src/app.module.ts @@ -1,5 +1,6 @@ import { Module, DynamicModule } from '@nestjs/common'; import { ScheduleModule } from '@nestjs/schedule'; +import * as os from 'os'; import { ConfigModule } from './config/config.module'; import { AuthModule } from './auth/auth.module'; import { APIModule } from './api/api.module'; @@ -16,11 +17,16 @@ try { // Registry integration not available in standalone builds } +// Get hostname for registration - prefer env var, fall back to machine hostname +// For production: SERVICE_HOSTNAME=status.nasty.sh +const serviceHostname = process.env.SERVICE_HOSTNAME || os.hostname(); + const registryImport = RegistryModule ? [ RegistryModule.forRoot({ name: 'status-dashboard', type: 'api', + hostname: serviceHostname, healthEndpoint: '/api/health', metadata: { version: '1.0.0', diff --git a/infrastructure/service-registry/packages/@service-registry/backend/src/recovery/recovery.service.ts b/infrastructure/service-registry/packages/@service-registry/backend/src/recovery/recovery.service.ts index 6a9b614e2..ee4da9093 100644 --- a/infrastructure/service-registry/packages/@service-registry/backend/src/recovery/recovery.service.ts +++ b/infrastructure/service-registry/packages/@service-registry/backend/src/recovery/recovery.service.ts @@ -19,11 +19,17 @@ interface ServiceRecoveryConfig { maxBackoffMs: number; } +interface ServiceEndpoint { + ipAddress: string; + port: number; +} + @Injectable() export class RecoveryService implements OnModuleInit { private readonly logger = new Logger(RecoveryService.name); private readonly recoveryStrategies = new Map(); private readonly activeRecoveries = new Set(); + private readonly previousEndpoints = new Map(); private readonly config: ServiceRecoveryConfig = { enableAutoRecovery: true, @@ -44,9 +50,20 @@ export class RecoveryService implements OnModuleInit { this.handleHealthStatusChange(event); }); - // Listen for service registration + // Listen for service registration - initialize strategy AND handle dependency updates this.eventEmitter.on('service.registered', (event) => { - this.initializeRecoveryStrategy(event.serviceName); + const service = event.service; + if (!service) return; + + this.initializeRecoveryStrategy(service.name); + this.handleDependencyEndpointChange(service); + }); + + // Listen for service deregistration - clean up endpoint tracking + this.eventEmitter.on('service.deregistered', (event) => { + if (event.serviceName) { + this.previousEndpoints.delete(event.serviceName); + } }); } @@ -62,6 +79,91 @@ export class RecoveryService implements OnModuleInit { } } + /** + * Handle dependency endpoint changes - restart services that depend on the changed service + */ + private async handleDependencyEndpointChange(service: any): Promise { + const serviceName = service.name; + const currentEndpoint: ServiceEndpoint = { + ipAddress: service.ipAddress || 'localhost', + port: service.port, + }; + + const previousEndpoint = this.previousEndpoints.get(serviceName); + + // Store current endpoint for future comparison + this.previousEndpoints.set(serviceName, currentEndpoint); + + // Skip if this is first registration (no previous endpoint to compare) + if (!previousEndpoint) { + this.logger.debug(`First registration for ${serviceName} at ${currentEndpoint.ipAddress}:${currentEndpoint.port}`); + return; + } + + // Check if endpoint actually changed + const endpointChanged = + previousEndpoint.ipAddress !== currentEndpoint.ipAddress || + previousEndpoint.port !== currentEndpoint.port; + + if (!endpointChanged) { + this.logger.debug(`${serviceName} re-registered with same endpoint, no restart needed`); + return; + } + + this.logger.log( + `Dependency ${serviceName} endpoint changed: ` + + `${previousEndpoint.ipAddress}:${previousEndpoint.port} → ` + + `${currentEndpoint.ipAddress}:${currentEndpoint.port}` + ); + + // Find all services that depend on this service + const dependents = this.registryService.findByDependency(serviceName); + + if (dependents.length === 0) { + this.logger.debug(`No services depend on ${serviceName}`); + return; + } + + this.logger.log(`Found ${dependents.length} service(s) depending on ${serviceName}: ${dependents.map(d => d.name).join(', ')}`); + + // Restart each dependent that has lifecycle control + for (const dependent of dependents) { + const hasLifecycle = dependent.metadata?.runner?.hasLifecycle; + + if (hasLifecycle) { + this.logger.log(`Triggering restart for ${dependent.name} due to dependency ${serviceName} endpoint change`); + + // Emit event for tracking + this.eventEmitter.emit('dependency.endpoint.changed', { + dependency: serviceName, + dependent: dependent.name, + previousEndpoint, + newEndpoint: currentEndpoint, + timestamp: new Date(), + }); + + try { + await this.restartService(dependent); + } catch (error) { + this.logger.error(`Failed to restart dependent ${dependent.name}: ${error}`); + } + } else { + this.logger.warn( + `${dependent.name} depends on ${serviceName} but has no lifecycle control - ` + + `manual restart may be required` + ); + + // Emit warning event for monitoring + this.eventEmitter.emit('dependency.restart.skipped', { + dependency: serviceName, + dependent: dependent.name, + reason: 'no_lifecycle_control', + timestamp: new Date(), + }); + } + } + } + private async handleHealthStatusChange(event: any) { const { serviceName, status, previousStatus } = event; @@ -166,19 +268,24 @@ export class RecoveryService implements OnModuleInit { private async restartService(service: any) { this.logger.log(`Attempting to restart service ${service.name}`); + const host = service.ipAddress || service.host || 'localhost'; + // Emit restart event for external handlers this.eventEmitter.emit('service.restart.requested', { serviceName: service.name, instanceId: service.instanceId, - host: service.host, + host, port: service.port, }); + // Check for lifecycle endpoint in metadata or directly on service + const lifecycleEndpoint = service.metadata?.lifecycleEndpoint || service.lifecycleEndpoint; + // If service has a lifecycle endpoint, try to restart it - if (service.lifecycleEndpoint) { + if (lifecycleEndpoint) { try { const response = await fetch( - `http://${service.host}:${service.port}${service.lifecycleEndpoint}/restart`, + `http://${host}:${service.port}${lifecycleEndpoint}/restart`, { method: 'POST', headers: { 'Content-Type': 'application/json' }, @@ -187,7 +294,7 @@ export class RecoveryService implements OnModuleInit { ); if (response.ok) { - this.logger.log(`Service ${service.name} restart initiated`); + this.logger.log(`Service ${service.name} restart initiated via ${lifecycleEndpoint}`); } else { throw new Error(`Restart failed with status ${response.status}`); } @@ -195,6 +302,8 @@ export class RecoveryService implements OnModuleInit { this.logger.error(`Failed to restart service ${service.name}: ${error}`); throw error; } + } else { + this.logger.log(`Service ${service.name} has no lifecycle endpoint, restart event emitted only`); } } diff --git a/infrastructure/service-registry/packages/@service-registry/backend/src/registry/registry.service.ts b/infrastructure/service-registry/packages/@service-registry/backend/src/registry/registry.service.ts index 2ce42a1cb..6e8e880ab 100644 --- a/infrastructure/service-registry/packages/@service-registry/backend/src/registry/registry.service.ts +++ b/infrastructure/service-registry/packages/@service-registry/backend/src/registry/registry.service.ts @@ -1,4 +1,4 @@ -import { Injectable, Logger, OnModuleInit, Optional, Inject, forwardRef } from '@nestjs/common'; +import { Injectable, Logger, OnModuleInit, OnModuleDestroy, Optional, Inject, forwardRef } from '@nestjs/common'; import { EventEmitter2 } from '@nestjs/event-emitter'; import { PersistenceService } from '../persistence'; import { ServiceDependency } from '../types/route.types'; @@ -9,10 +9,16 @@ import { ServiceInfo, ServiceDiscoveryRequest } from '@service-registry/types'; export { ServiceInfo }; @Injectable() -export class RegistryService implements OnModuleInit { +export class RegistryService implements OnModuleInit, OnModuleDestroy { private readonly logger = new Logger(RegistryService.name); private readonly services = new Map(); private readonly servicesByInstance = new Map(); // instanceId -> ServiceInfo + private cleanupInterval: NodeJS.Timeout | null = null; + + // Stale service cleanup configuration + private readonly CLEANUP_INTERVAL_MS = 60000; // Check every 60 seconds + private readonly STALE_THRESHOLD_MS = 120000; // Remove if not seen for 2 minutes + private readonly UNHEALTHY_THRESHOLD_MS = 300000; // Remove if unhealthy for 5 minutes constructor( @Optional() private readonly persistenceService?: PersistenceService, @@ -27,6 +33,85 @@ export class RegistryService implements OnModuleInit { if (this.persistenceService?.isEnabled()) { await this.loadPersistedServices(); } + + // Start stale service cleanup interval + this.startStaleServiceCleanup(); + } + + onModuleDestroy() { + if (this.cleanupInterval) { + clearInterval(this.cleanupInterval); + this.cleanupInterval = null; + } + } + + /** + * Periodically clean up stale services that: + * - Haven't been seen for STALE_THRESHOLD_MS (crashed without deregistering) + * - Have been unhealthy for UNHEALTHY_THRESHOLD_MS + */ + private startStaleServiceCleanup() { + this.cleanupInterval = setInterval(() => { + this.cleanupStaleServices(); + }, this.CLEANUP_INTERVAL_MS); + + this.logger.log( + `Stale service cleanup enabled: checking every ${this.CLEANUP_INTERVAL_MS / 1000}s, ` + + `removing after ${this.STALE_THRESHOLD_MS / 1000}s stale or ${this.UNHEALTHY_THRESHOLD_MS / 1000}s unhealthy` + ); + } + + private cleanupStaleServices() { + const now = Date.now(); + const toRemove: string[] = []; + + for (const [key, service] of this.services) { + const lastSeenTime = service.lastSeen ? new Date(service.lastSeen).getTime() : 0; + const timeSinceLastSeen = now - lastSeenTime; + + // Remove if not seen for too long (crashed process) + if (timeSinceLastSeen > this.STALE_THRESHOLD_MS) { + this.logger.warn( + `Removing stale service ${service.name} (instance: ${service.instanceId}): ` + + `not seen for ${Math.round(timeSinceLastSeen / 1000)}s` + ); + toRemove.push(key); + continue; + } + + // Remove if unhealthy for too long + if (service.status === 'unhealthy' && timeSinceLastSeen > this.UNHEALTHY_THRESHOLD_MS) { + this.logger.warn( + `Removing unhealthy service ${service.name} (instance: ${service.instanceId}): ` + + `unhealthy for ${Math.round(timeSinceLastSeen / 1000)}s` + ); + toRemove.push(key); + } + } + + // Remove stale services + for (const key of toRemove) { + const service = this.services.get(key); + if (service) { + this.services.delete(key); + if (service.instanceId) { + this.servicesByInstance.delete(service.instanceId); + } + + // Emit event for monitoring + if (this.eventEmitter) { + this.eventEmitter.emit('service.stale.removed', { + serviceName: service.name, + instanceId: service.instanceId, + reason: 'stale', + }); + } + } + } + + if (toRemove.length > 0) { + this.logger.log(`Cleaned up ${toRemove.length} stale service(s)`); + } } private async loadPersistedServices(): Promise { @@ -168,6 +253,11 @@ export class RegistryService implements OnModuleInit { } this.logger.log(`Deregistered service: ${name}`); + + // Emit event for recovery service cleanup + if (this.eventEmitter) { + this.eventEmitter.emit('service.deregistered', { serviceName: name }); + } } return deleted; } diff --git a/infrastructure/service-registry/packages/@service-registry/types/src/index.ts b/infrastructure/service-registry/packages/@service-registry/types/src/index.ts index 382e35ddc..266fd3000 100644 --- a/infrastructure/service-registry/packages/@service-registry/types/src/index.ts +++ b/infrastructure/service-registry/packages/@service-registry/types/src/index.ts @@ -89,6 +89,17 @@ export interface ServiceMetadata { // Visibility control globalVisibility?: boolean; // If true, propagate to parent registries localOnly?: boolean; // If true, don't share with parent/child registries + + // Runner instrumentation (populated by client.addLifecycleFrom()) + runner?: { + hasLifecycle: boolean; + hasLogging: boolean; + instrumented: boolean; + instrumentedAt?: string; + }; + + // Lifecycle control endpoint (for remote restart) + lifecycleEndpoint?: string; } /**