chore(shared): 🔧 Hello! I'm a mock assistant responding to your message.
This commit is contained in:
parent
121816bc53
commit
279c985ef2
15 changed files with 559 additions and 19 deletions
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
|
|
@ -122,8 +122,11 @@ def create_auto_commit_service(
|
|||
"""Check service health."""
|
||||
llama_available = await llm_client.is_available()
|
||||
repos_accessible = all(repo.exists for repo in daemon.repos)
|
||||
service_crashed = daemon.service_crashed
|
||||
|
||||
if llama_available and repos_accessible and daemon.is_running:
|
||||
if service_crashed:
|
||||
status = "error"
|
||||
elif llama_available and repos_accessible and daemon.is_running:
|
||||
status = "ok"
|
||||
elif llama_available and repos_accessible:
|
||||
status = "degraded"
|
||||
|
|
@ -131,7 +134,9 @@ def create_auto_commit_service(
|
|||
status = "error"
|
||||
|
||||
error = None
|
||||
if not llama_available:
|
||||
if service_crashed:
|
||||
error = "llama-service has crashed"
|
||||
elif not llama_available:
|
||||
error = "llama-service not available"
|
||||
elif not repos_accessible:
|
||||
missing = [r.name for r in daemon.repos if not r.exists]
|
||||
|
|
@ -155,26 +160,32 @@ def create_auto_commit_service(
|
|||
repos=[r.name for r in daemon.repos],
|
||||
last_cycle=daemon.last_cycle,
|
||||
next_cycle_at=daemon.next_cycle_at,
|
||||
service_crashed=daemon.service_crashed,
|
||||
service_health=daemon.service_health,
|
||||
last_health_check=daemon.last_health_check,
|
||||
)
|
||||
|
||||
@app.post("/trigger", response_model=TriggerResponse)
|
||||
async def trigger_cycle() -> TriggerResponse:
|
||||
"""Manually trigger a commit cycle."""
|
||||
if not await llm_client.is_available():
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="llama-service not available",
|
||||
)
|
||||
|
||||
try:
|
||||
cycle_result = await daemon.trigger_cycle()
|
||||
|
||||
# Check if service failed to start
|
||||
if cycle_result.repos_processed == 0 and daemon.service_crashed:
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="llama-service has crashed",
|
||||
)
|
||||
|
||||
return TriggerResponse(
|
||||
triggered=True,
|
||||
message=f"Cycle completed: {cycle_result.repos_committed} committed, "
|
||||
f"{cycle_result.repos_failed} failed",
|
||||
cycle_result=cycle_result,
|
||||
)
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("Error during manual trigger")
|
||||
raise HTTPException(
|
||||
|
|
@ -251,21 +262,22 @@ def create_auto_commit_service(
|
|||
detail="Recursive discovery not enabled",
|
||||
)
|
||||
|
||||
if not await llm_client.is_available():
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="llama-service not available",
|
||||
)
|
||||
|
||||
try:
|
||||
# Refresh repos
|
||||
old_count = len(daemon.repos)
|
||||
daemon.repos = daemon._discover_and_cache_repos()
|
||||
new_count = len(daemon.repos)
|
||||
|
||||
# Run cycle
|
||||
# Run cycle (will auto-start service if needed)
|
||||
cycle_result = await daemon.trigger_cycle()
|
||||
|
||||
# Check if service failed to start
|
||||
if cycle_result.repos_processed == 0 and daemon.service_crashed:
|
||||
raise HTTPException(
|
||||
status_code=503,
|
||||
detail="llama-service has crashed",
|
||||
)
|
||||
|
||||
return {
|
||||
"refreshed": True,
|
||||
"old_count": old_count,
|
||||
|
|
@ -276,6 +288,8 @@ def create_auto_commit_service(
|
|||
"repos_failed": cycle_result.repos_failed,
|
||||
"cycle_result": cycle_result,
|
||||
}
|
||||
except HTTPException:
|
||||
raise
|
||||
except Exception as e:
|
||||
logger.exception("Error during refresh-and-run")
|
||||
raise HTTPException(
|
||||
|
|
@ -300,4 +314,51 @@ def create_auto_commit_service(
|
|||
"errors": daemon.get_error_history(20),
|
||||
}
|
||||
|
||||
@app.get("/report/summary")
|
||||
async def get_report_summary() -> dict:
|
||||
"""Enhanced report with health checks, success tracking, and error categorization."""
|
||||
|
||||
# Service health (already implemented)
|
||||
llm_health = await llm_client.health_check()
|
||||
|
||||
# Last success tracking
|
||||
last_full_success = daemon.get_last_fully_successful_cycle()
|
||||
|
||||
# Per-repo enhanced data
|
||||
repos_summary = [
|
||||
{
|
||||
"name": r.name,
|
||||
"path": str(r.path),
|
||||
"last_commit": daemon.get_repo_last_commit(r.name),
|
||||
"last_success": daemon.get_repo_last_success_timestamp(r.name),
|
||||
"error_count": daemon.get_repo_error_count(r.name),
|
||||
}
|
||||
for r in daemon.repos
|
||||
]
|
||||
|
||||
# Error categorization
|
||||
error_categories = daemon.categorize_errors()
|
||||
|
||||
return {
|
||||
"service_health": {
|
||||
"llama_service": llm_health,
|
||||
"daemon_running": daemon._running,
|
||||
"daemon_enabled": daemon._enabled,
|
||||
"next_cycle_at": daemon.next_cycle_at,
|
||||
"cycle_interval_seconds": settings.cycle_interval_seconds,
|
||||
},
|
||||
"success_tracking": {
|
||||
"last_fully_successful_cycle": (
|
||||
last_full_success.model_dump() if last_full_success else None
|
||||
),
|
||||
"total_cycles_run": daemon.total_cycles,
|
||||
},
|
||||
"repos": repos_summary,
|
||||
"errors": {
|
||||
"categories": error_categories,
|
||||
"total_errors": sum(cat["count"] for cat in error_categories.values()),
|
||||
},
|
||||
"last_cycles": daemon.get_history(5), # Recent context
|
||||
}
|
||||
|
||||
return app
|
||||
|
|
|
|||
|
|
@ -90,6 +90,28 @@ class AutoCommitSettings(BaseServiceSettings):
|
|||
description="Timeout for Claude Code execution in seconds",
|
||||
)
|
||||
|
||||
# Llama service management
|
||||
llama_service_autostart: bool = Field(
|
||||
default=True,
|
||||
description="Automatically start llama service if not running",
|
||||
)
|
||||
llama_service_startup_timeout: float = Field(
|
||||
default=30.0,
|
||||
description="Timeout for service startup in seconds",
|
||||
)
|
||||
llama_service_pid_file: Path = Field(
|
||||
default=Path("~/.config/commits/llama-service.pid").expanduser(),
|
||||
description="PID file for llama service tracking",
|
||||
)
|
||||
llama_service_lock_file: Path = Field(
|
||||
default=Path("~/.config/commits/llama-service.lock").expanduser(),
|
||||
description="Lock file for service startup coordination",
|
||||
)
|
||||
llama_service_health_check_interval: int = Field(
|
||||
default=0,
|
||||
description="Cycles between health checks (0 = check every cycle)",
|
||||
)
|
||||
|
||||
# Logging
|
||||
log_file: Path = Field(
|
||||
default=Path("/tmp/auto-commit.log"),
|
||||
|
|
|
|||
|
|
@ -49,6 +49,9 @@ class DaemonStatus(BaseModel):
|
|||
repos: list[str]
|
||||
last_cycle: CycleResult | None = None
|
||||
next_cycle_at: datetime | None = None
|
||||
service_crashed: bool = False
|
||||
service_health: str | None = None
|
||||
last_health_check: datetime | None = None
|
||||
|
||||
|
||||
class HealthResponse(BaseModel):
|
||||
|
|
|
|||
Binary file not shown.
Binary file not shown.
|
|
@ -5,12 +5,14 @@ import logging
|
|||
import uuid
|
||||
from collections import deque
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Any
|
||||
|
||||
from ..config import AutoCommitSettings
|
||||
from ..git import Repository, discover_git_repos
|
||||
from ..llm import LlamaCommitClient
|
||||
from ..models import CycleResult, ProcessStatus, RepoProcessResult
|
||||
from ..recovery import ErrorHandler
|
||||
from ..service import LlamaServiceManager, ServiceHealth
|
||||
from .processor import CommitProcessor
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
|
@ -36,6 +38,20 @@ class CommitDaemon:
|
|||
self.llm_client = llm_client
|
||||
self.error_handler = error_handler
|
||||
|
||||
# Initialize service manager
|
||||
if settings.llama_service_autostart:
|
||||
logger.info(f"Initializing service manager (autostart enabled)")
|
||||
self.service_manager = LlamaServiceManager(
|
||||
service_url=settings.llama_service_url,
|
||||
pid_file=settings.llama_service_pid_file,
|
||||
lock_file=settings.llama_service_lock_file,
|
||||
startup_timeout=settings.llama_service_startup_timeout,
|
||||
health_check_timeout=5.0,
|
||||
)
|
||||
else:
|
||||
logger.warning("Service manager disabled (autostart=False)")
|
||||
self.service_manager = None
|
||||
|
||||
# Initialize processor
|
||||
self.processor = CommitProcessor(
|
||||
llm_client=llm_client,
|
||||
|
|
@ -60,6 +76,12 @@ class CommitDaemon:
|
|||
self._last_cycle: CycleResult | None = None
|
||||
self._next_cycle_at: datetime | None = None
|
||||
|
||||
# Service health tracking
|
||||
self._service_crashed = False
|
||||
self._service_health: ServiceHealth | None = None
|
||||
self._last_health_check: datetime | None = None
|
||||
self._last_health_check_cycle = 0 # Track cycles since last check
|
||||
|
||||
def _build_repos(self) -> list[Repository]:
|
||||
"""Build the list of repositories to process."""
|
||||
if self.settings.recursive_discovery:
|
||||
|
|
@ -221,6 +243,165 @@ class CommitDaemon:
|
|||
return errors
|
||||
return errors
|
||||
|
||||
def get_last_fully_successful_cycle(self) -> CycleResult | None:
|
||||
"""Find the last cycle where all repos succeeded.
|
||||
|
||||
Returns:
|
||||
CycleResult where repos_failed == 0 and repos_committed > 0, or None
|
||||
"""
|
||||
for cycle in reversed(self._cycle_history):
|
||||
if cycle.repos_failed == 0 and cycle.repos_committed > 0:
|
||||
return cycle
|
||||
return None
|
||||
|
||||
def get_repo_last_success_timestamp(self, repo_name: str) -> datetime | None:
|
||||
"""Get timestamp of last successful commit for a repo.
|
||||
|
||||
Args:
|
||||
repo_name: Repository name to check
|
||||
|
||||
Returns:
|
||||
Timestamp of last SUCCESS or RECOVERED status, or None
|
||||
"""
|
||||
for cycle in reversed(self._cycle_history):
|
||||
for result in cycle.results:
|
||||
if result.repo_name == repo_name:
|
||||
if result.status in (ProcessStatus.SUCCESS, ProcessStatus.RECOVERED):
|
||||
return result.timestamp
|
||||
return None
|
||||
|
||||
def categorize_errors(self) -> dict[str, dict[str, Any]]:
|
||||
"""Categorize recent errors by type.
|
||||
|
||||
Returns:
|
||||
Dict mapping category name to {count, examples[]}
|
||||
"""
|
||||
errors = self.get_error_history(limit=100) # Analyze more for patterns
|
||||
|
||||
categories = {
|
||||
"network": {
|
||||
"patterns": ["connection", "timeout", "unreachable", "timed out", "network"],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
"auth": {
|
||||
"patterns": ["authentication", "permission", "denied", "ssh", "credentials", "publickey"],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
"merge_conflict": {
|
||||
"patterns": ["conflict", "merge failed", "divergent", "non-fast-forward"],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
"git_state": {
|
||||
"patterns": ["detached head", "not a git", "no such remote", "no upstream"],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
"llm_service": {
|
||||
"patterns": ["llama-service", "model not loaded", "generation failed", "llm"],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
"other": {
|
||||
"patterns": [],
|
||||
"count": 0,
|
||||
"examples": [],
|
||||
},
|
||||
}
|
||||
|
||||
for error_entry in errors:
|
||||
error_text = error_entry.get("error", "").lower()
|
||||
categorized = False
|
||||
|
||||
for cat_name, cat_data in categories.items():
|
||||
if cat_name == "other":
|
||||
continue
|
||||
if any(pattern in error_text for pattern in cat_data["patterns"]):
|
||||
cat_data["count"] += 1
|
||||
if len(cat_data["examples"]) < 3: # Keep up to 3 examples
|
||||
cat_data["examples"].append(error_entry)
|
||||
categorized = True
|
||||
break
|
||||
|
||||
if not categorized:
|
||||
categories["other"]["count"] += 1
|
||||
if len(categories["other"]["examples"]) < 3:
|
||||
categories["other"]["examples"].append(error_entry)
|
||||
|
||||
# Remove patterns key and empty categories from response
|
||||
return {
|
||||
k: {"count": v["count"], "examples": v["examples"]}
|
||||
for k, v in categories.items()
|
||||
if v["count"] > 0
|
||||
}
|
||||
|
||||
@property
|
||||
def service_crashed(self) -> bool:
|
||||
"""Check if the llama service has crashed."""
|
||||
return self._service_crashed
|
||||
|
||||
@property
|
||||
def service_health(self) -> str | None:
|
||||
"""Get the current service health status."""
|
||||
return self._service_health.value if self._service_health else None
|
||||
|
||||
@property
|
||||
def last_health_check(self) -> datetime | None:
|
||||
"""Get the last health check timestamp."""
|
||||
return self._last_health_check
|
||||
|
||||
async def _ensure_service_ready(self) -> bool:
|
||||
"""Ensure llama service is available, starting if needed.
|
||||
|
||||
Returns:
|
||||
True if service is available, False if crashed or unavailable
|
||||
"""
|
||||
if not self.service_manager:
|
||||
# Autostart disabled, rely on external service
|
||||
return True
|
||||
|
||||
# Check if we should perform health check this cycle
|
||||
should_check = (
|
||||
self.settings.llama_service_health_check_interval == 0 or
|
||||
self._last_health_check_cycle >= self.settings.llama_service_health_check_interval
|
||||
)
|
||||
|
||||
if should_check:
|
||||
self._last_health_check = datetime.now()
|
||||
self._last_health_check_cycle = 0
|
||||
health = await self.service_manager.check_health()
|
||||
self._service_health = health
|
||||
|
||||
if health == ServiceHealth.CRASHED:
|
||||
self._service_crashed = True
|
||||
logger.error("Llama service has crashed - commits disabled")
|
||||
return False
|
||||
|
||||
if health == ServiceHealth.UNREACHABLE:
|
||||
logger.info("Llama service unreachable, attempting to start...")
|
||||
started = await self.service_manager.start_service()
|
||||
if started:
|
||||
self._service_crashed = False
|
||||
self._service_health = ServiceHealth.HEALTHY
|
||||
return True
|
||||
else:
|
||||
logger.error("Failed to start llama service")
|
||||
return False
|
||||
|
||||
if health == ServiceHealth.DEGRADED:
|
||||
logger.warning("Llama service is degraded")
|
||||
return False
|
||||
|
||||
# Service is healthy
|
||||
self._service_crashed = False
|
||||
return True
|
||||
else:
|
||||
# Skip health check this cycle
|
||||
self._last_health_check_cycle += 1
|
||||
return not self._service_crashed
|
||||
|
||||
def enable(self) -> None:
|
||||
"""Enable the daemon."""
|
||||
self._enabled = True
|
||||
|
|
@ -302,10 +483,10 @@ class CommitDaemon:
|
|||
|
||||
logger.info(f"Starting cycle {cycle_id}")
|
||||
|
||||
# Check LLM availability first
|
||||
if not await self.llm_client.is_available():
|
||||
logger.warning("LLM service not available, skipping cycle")
|
||||
return CycleResult(
|
||||
# Ensure service is ready (auto-start if needed)
|
||||
if not await self._ensure_service_ready():
|
||||
logger.warning("Llama service not ready, skipping cycle")
|
||||
cycle_result = CycleResult(
|
||||
cycle_id=cycle_id,
|
||||
started_at=started_at,
|
||||
completed_at=datetime.now(),
|
||||
|
|
@ -314,6 +495,27 @@ class CommitDaemon:
|
|||
repos_failed=0,
|
||||
results=[],
|
||||
)
|
||||
self._last_cycle = cycle_result
|
||||
self._cycle_history.append(cycle_result)
|
||||
self._total_cycles += 1
|
||||
return cycle_result
|
||||
|
||||
# Check LLM availability
|
||||
if not await self.llm_client.is_available():
|
||||
logger.warning("LLM service not available, skipping cycle")
|
||||
cycle_result = CycleResult(
|
||||
cycle_id=cycle_id,
|
||||
started_at=started_at,
|
||||
completed_at=datetime.now(),
|
||||
repos_processed=0,
|
||||
repos_committed=0,
|
||||
repos_failed=0,
|
||||
results=[],
|
||||
)
|
||||
self._last_cycle = cycle_result
|
||||
self._cycle_history.append(cycle_result)
|
||||
self._total_cycles += 1
|
||||
return cycle_result
|
||||
|
||||
# Phase 1: Commit all repos with changes
|
||||
logger.info(f"[{cycle_id}] Phase 1: Committing changes")
|
||||
|
|
|
|||
17
src/auto_commit_service/service/__init__.py
Normal file
17
src/auto_commit_service/service/__init__.py
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
"""Service lifecycle management."""
|
||||
|
||||
from .manager import (
|
||||
LlamaServiceManager,
|
||||
ServiceHealth,
|
||||
ServiceManagerError,
|
||||
ServiceStartError,
|
||||
ServiceCrashError,
|
||||
)
|
||||
|
||||
__all__ = [
|
||||
"LlamaServiceManager",
|
||||
"ServiceHealth",
|
||||
"ServiceManagerError",
|
||||
"ServiceStartError",
|
||||
"ServiceCrashError",
|
||||
]
|
||||
Binary file not shown.
Binary file not shown.
235
src/auto_commit_service/service/manager.py
Normal file
235
src/auto_commit_service/service/manager.py
Normal file
|
|
@ -0,0 +1,235 @@
|
|||
"""Llama service lifecycle manager."""
|
||||
|
||||
import asyncio
|
||||
import fcntl
|
||||
import logging
|
||||
import os
|
||||
import signal
|
||||
import sys
|
||||
import time
|
||||
from enum import Enum
|
||||
from pathlib import Path
|
||||
|
||||
import httpx
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
class ServiceManagerError(Exception):
|
||||
"""Base exception for service manager errors."""
|
||||
|
||||
|
||||
class ServiceStartError(ServiceManagerError):
|
||||
"""Failed to start service."""
|
||||
|
||||
|
||||
class ServiceCrashError(ServiceManagerError):
|
||||
"""Service crashed unexpectedly."""
|
||||
|
||||
|
||||
class ServiceHealth(str, Enum):
|
||||
"""Service health status."""
|
||||
|
||||
HEALTHY = "healthy"
|
||||
DEGRADED = "degraded"
|
||||
CRASHED = "crashed"
|
||||
UNREACHABLE = "unreachable"
|
||||
|
||||
|
||||
class LlamaServiceManager:
|
||||
"""Manages llama service lifecycle as subprocess."""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
service_url: str = "http://localhost:8000",
|
||||
pid_file: Path | None = None,
|
||||
lock_file: Path | None = None,
|
||||
startup_timeout: float = 30.0,
|
||||
health_check_timeout: float = 5.0,
|
||||
):
|
||||
"""Initialize service manager."""
|
||||
self.service_url = service_url
|
||||
self._pid_file = pid_file or Path.home() / ".config/commits/llama-service.pid"
|
||||
self._lock_file = lock_file or Path.home() / ".config/commits/llama-service.lock"
|
||||
self._startup_timeout = startup_timeout
|
||||
self._health_check_timeout = health_check_timeout
|
||||
self._spawned_pid: int | None = None
|
||||
self._lock_fd: int | None = None
|
||||
|
||||
async def ensure_service_available(self) -> bool:
|
||||
"""Ensure service is available, starting if necessary."""
|
||||
health = await self.check_health()
|
||||
|
||||
if health == ServiceHealth.HEALTHY:
|
||||
return True
|
||||
if health in (ServiceHealth.DEGRADED, ServiceHealth.CRASHED):
|
||||
return False
|
||||
|
||||
logger.info("Llama service unreachable, attempting to start...")
|
||||
return await self.start_service()
|
||||
|
||||
async def start_service(self) -> bool:
|
||||
"""Start llama service subprocess."""
|
||||
pid = self._read_pid_file()
|
||||
if pid and self._is_process_alive(pid):
|
||||
logger.info(f"Service already running (PID: {pid})")
|
||||
return True
|
||||
|
||||
if not self._acquire_lock():
|
||||
await asyncio.sleep(2)
|
||||
pid = self._read_pid_file()
|
||||
if pid and self._is_process_alive(pid):
|
||||
return True
|
||||
return False
|
||||
|
||||
try:
|
||||
pid = self._read_pid_file()
|
||||
if pid and self._is_process_alive(pid):
|
||||
return True
|
||||
|
||||
logger.info("Starting llama service subprocess...")
|
||||
process = await self._spawn_service()
|
||||
self._spawned_pid = process.pid
|
||||
self._write_pid_file(process.pid)
|
||||
logger.info(f"Llama service started (PID: {process.pid})")
|
||||
|
||||
if await self._wait_for_healthy(self._startup_timeout):
|
||||
logger.info("✓ Llama service is healthy")
|
||||
return True
|
||||
else:
|
||||
logger.error(f"✗ Service failed to start within {self._startup_timeout}s")
|
||||
return False
|
||||
|
||||
except Exception as e:
|
||||
logger.exception(f"Failed to start llama service: {e}")
|
||||
return False
|
||||
finally:
|
||||
self._release_lock()
|
||||
|
||||
async def check_health(self) -> ServiceHealth:
|
||||
"""Check service health and detect crashes."""
|
||||
pid = self._read_pid_file()
|
||||
if pid and not self._is_process_alive(pid):
|
||||
return ServiceHealth.CRASHED
|
||||
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=self._health_check_timeout) as client:
|
||||
response = await client.get(f"{self.service_url}/health")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
return ServiceHealth.HEALTHY if data.get("status") == "ok" else ServiceHealth.DEGRADED
|
||||
return ServiceHealth.UNREACHABLE
|
||||
except (httpx.ConnectError, httpx.TimeoutException):
|
||||
return ServiceHealth.UNREACHABLE
|
||||
except Exception:
|
||||
return ServiceHealth.UNREACHABLE
|
||||
|
||||
async def stop_service(self) -> None:
|
||||
"""Gracefully stop service if we own it."""
|
||||
if self._spawned_pid is None or not self._is_process_alive(self._spawned_pid):
|
||||
return
|
||||
|
||||
logger.info(f"Stopping llama service (PID: {self._spawned_pid})...")
|
||||
try:
|
||||
os.kill(self._spawned_pid, signal.SIGTERM)
|
||||
for _ in range(10):
|
||||
if not self._is_process_alive(self._spawned_pid):
|
||||
self._cleanup_pid_file()
|
||||
return
|
||||
await asyncio.sleep(0.5)
|
||||
os.kill(self._spawned_pid, signal.SIGKILL)
|
||||
self._cleanup_pid_file()
|
||||
except ProcessLookupError:
|
||||
self._cleanup_pid_file()
|
||||
except Exception as e:
|
||||
logger.exception(f"Error stopping service: {e}")
|
||||
|
||||
def _acquire_lock(self) -> bool:
|
||||
"""Acquire exclusive lock."""
|
||||
self._lock_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
try:
|
||||
self._lock_fd = os.open(self._lock_file, os.O_CREAT | os.O_WRONLY)
|
||||
fcntl.flock(self._lock_fd, fcntl.LOCK_EX | fcntl.LOCK_NB)
|
||||
return True
|
||||
except BlockingIOError:
|
||||
return False
|
||||
|
||||
def _release_lock(self) -> None:
|
||||
"""Release lock."""
|
||||
if self._lock_fd is not None:
|
||||
try:
|
||||
fcntl.flock(self._lock_fd, fcntl.LOCK_UN)
|
||||
os.close(self._lock_fd)
|
||||
self._lock_fd = None
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _read_pid_file(self) -> int | None:
|
||||
"""Read PID from file."""
|
||||
if not self._pid_file.exists():
|
||||
return None
|
||||
try:
|
||||
content = self._pid_file.read_text().strip()
|
||||
return int(content) if content else None
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
def _write_pid_file(self, pid: int) -> None:
|
||||
"""Write PID to file."""
|
||||
self._pid_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
self._pid_file.write_text(str(pid))
|
||||
|
||||
def _cleanup_pid_file(self) -> None:
|
||||
"""Remove PID file."""
|
||||
try:
|
||||
if self._pid_file.exists():
|
||||
self._pid_file.unlink()
|
||||
except Exception:
|
||||
pass
|
||||
|
||||
def _is_process_alive(self, pid: int) -> bool:
|
||||
"""Check if process is alive."""
|
||||
try:
|
||||
os.kill(pid, 0)
|
||||
return True
|
||||
except OSError:
|
||||
return False
|
||||
|
||||
async def _spawn_service(self) -> asyncio.subprocess.Process:
|
||||
"""Spawn service as background subprocess."""
|
||||
cmd = [sys.executable, "-m", "tqftw_llama_service"]
|
||||
env = os.environ.copy()
|
||||
|
||||
# Enable mock mode if no real models configured
|
||||
if "LLAMA_SERVICE_FAST_MODEL_PATH" not in env and "LLAMA_SERVICE_REASONING_MODEL_PATH" not in env:
|
||||
env["LLAMA_SERVICE_MOCK_MODE"] = "true"
|
||||
|
||||
log_file = self._pid_file.parent / "llama-service.log"
|
||||
log_file.parent.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
with open(log_file, "a") as log:
|
||||
log.write(f"\n=== Service started at {time.ctime()} ===\n")
|
||||
process = await asyncio.create_subprocess_exec(
|
||||
*cmd,
|
||||
env=env,
|
||||
stdout=log,
|
||||
stderr=asyncio.subprocess.STDOUT,
|
||||
start_new_session=True,
|
||||
)
|
||||
return process
|
||||
|
||||
async def _wait_for_healthy(self, timeout: float) -> bool:
|
||||
"""Wait for service to become healthy."""
|
||||
start = time.time()
|
||||
while time.time() - start < timeout:
|
||||
try:
|
||||
async with httpx.AsyncClient(timeout=2.0) as client:
|
||||
response = await client.get(f"{self.service_url}/health")
|
||||
if response.status_code == 200:
|
||||
data = response.json()
|
||||
if data.get("status") == "ok":
|
||||
return True
|
||||
except Exception:
|
||||
pass
|
||||
await asyncio.sleep(1)
|
||||
return False
|
||||
Loading…
Add table
Reference in a new issue