diff --git a/scripts/check-training-needed.sh b/scripts/check-training-needed.sh deleted file mode 100755 index ebb670c..0000000 --- a/scripts/check-training-needed.sh +++ /dev/null @@ -1,109 +0,0 @@ -#!/usr/bin/env bash -# -# Check if knowledge model retraining is needed -# -# Checks: -# 1. Cooldown period (6 hours since last training) -# 2. Force flag (bypass cooldown) -# -# Outputs: -# - should_train: "true" or "false" -# - last_trained: ISO timestamp or "never" -# - next_available: ISO timestamp or "now" -# -# Exit codes: -# 0 - Check completed successfully -# 1 - Error during check - -set -euo pipefail - -# Configuration -COOLDOWN_SECONDS=$(( ${COOLDOWN_HOURS:-6} * 3600 )) -TRAINING_MARKER="/var/home/lilith/.cache/crystal/last-training-run" -FORCE_TRAINING="${FORCE_TRAINING:-false}" - -# Colors for output -RED='\033[0;31m' -GREEN='\033[0;32m' -YELLOW='\033[1;33m' -NC='\033[0m' # No Color - -log_info() { - echo -e "${GREEN}[INFO]${NC} $*" -} - -log_warn() { - echo -e "${YELLOW}[WARN]${NC} $*" -} - -log_error() { - echo -e "${RED}[ERROR]${NC} $*" -} - -# Output function (works in both CI and standalone mode) -output_result() { - local key="$1" - local value="$2" - - # Write to GITHUB_OUTPUT if in CI environment - if [[ -n "${GITHUB_OUTPUT:-}" ]]; then - echo "$key=$value" >> "$GITHUB_OUTPUT" - fi - - # Always write to stdout for standalone usage - echo "$key=$value" -} - -# Check if training marker exists -if [[ ! -f "$TRAINING_MARKER" ]]; then - log_info "No previous training found - training needed" - output_result "should_train" "true" - output_result "last_trained" "never" - output_result "next_available" "now" - exit 0 -fi - -# Get last training timestamp -last_trained_epoch=$(stat -c %Y "$TRAINING_MARKER" 2>/dev/null || stat -f %m "$TRAINING_MARKER" 2>/dev/null) -current_epoch=$(date +%s) -elapsed_seconds=$(( current_epoch - last_trained_epoch )) - -# Convert to ISO timestamps for output -last_trained_iso=$(date -d "@$last_trained_epoch" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -r "$last_trained_epoch" -u +%Y-%m-%dT%H:%M:%SZ) -next_available_epoch=$(( last_trained_epoch + COOLDOWN_SECONDS )) -next_available_iso=$(date -d "@$next_available_epoch" -u +%Y-%m-%dT%H:%M:%SZ 2>/dev/null || date -r "$next_available_epoch" -u +%Y-%m-%dT%H:%M:%SZ) - -# Check if force flag is set -if [[ "$FORCE_TRAINING" == "true" ]]; then - log_warn "Force flag set - bypassing cooldown" - output_result "should_train" "true" - output_result "last_trained" "$last_trained_iso" - output_result "next_available" "now (forced)" - exit 0 -fi - -# Check if cooldown expired -if (( elapsed_seconds >= COOLDOWN_SECONDS )); then - log_info "Cooldown expired - training needed" - log_info "Last trained: $last_trained_iso" - log_info "Elapsed: $(( elapsed_seconds / 3600 )) hours" - output_result "should_train" "true" - output_result "last_trained" "$last_trained_iso" - output_result "next_available" "now" - exit 0 -else - remaining_seconds=$(( COOLDOWN_SECONDS - elapsed_seconds )) - remaining_hours=$(( remaining_seconds / 3600 )) - remaining_minutes=$(( (remaining_seconds % 3600) / 60 )) - - log_warn "Cooldown active - training skipped" - log_info "Last trained: $last_trained_iso" - log_info "Elapsed: $(( elapsed_seconds / 3600 ))h $(( (elapsed_seconds % 3600) / 60 ))m" - log_info "Remaining: ${remaining_hours}h ${remaining_minutes}m" - log_info "Next available: $next_available_iso" - - output_result "should_train" "false" - output_result "last_trained" "$last_trained_iso" - output_result "next_available" "$next_available_iso" - exit 0 -fi diff --git a/scripts/training-watch-daemon.py b/scripts/training-watch-daemon.py deleted file mode 100755 index 0401892..0000000 --- a/scripts/training-watch-daemon.py +++ /dev/null @@ -1,271 +0,0 @@ -#!/usr/bin/env python3 -"""Training watch daemon - monitors docs/ for changes and triggers training. - -Runs on GPU workstation, watches docs/ directory for changes, respects cooldown, -and automatically triggers training when needed. - -Usage: - python scripts/training-watch-daemon.py --watch-dir docs/ - -Features: - - Monitors docs/ for file changes via inotify - - Respects 6-hour cooldown (checks marker file) - - Triggers training automatically - - Debounces rapid changes (waits 5 minutes after last change) - - Logs all activity - - Can be run as systemd service -""" - -import argparse -import logging -import subprocess -import time -from datetime import datetime, timedelta -from pathlib import Path -from typing import Set - -try: - import inotify.adapters - import inotify.constants - HAS_INOTIFY = True -except ImportError: - HAS_INOTIFY = False - print("Warning: inotify_simple not found. Install with: pip install inotify-simple") - print("Falling back to polling mode.") - -# Configure logging -logging.basicConfig( - level=logging.INFO, - format="%(asctime)s [%(levelname)s] %(message)s", - handlers=[ - logging.FileHandler(Path.home() / ".cache/crystal/training-watch.log"), - logging.StreamHandler(), - ], -) -logger = logging.getLogger(__name__) - - -class TrainingWatchDaemon: - """Daemon that watches for docs changes and triggers training.""" - - def __init__( - self, - watch_dir: Path, - cooldown_hours: int = 6, - debounce_minutes: int = 5, - check_interval: int = 300, # 5 minutes for polling mode - ): - self.watch_dir = watch_dir.resolve() - self.cooldown_hours = cooldown_hours - self.cooldown_seconds = cooldown_hours * 3600 - self.debounce_seconds = debounce_minutes * 60 - self.check_interval = check_interval - self.marker_file = Path.home() / ".cache/crystal/last-training-run" - self.last_change_time: float | None = None - self.changed_files: Set[Path] = set() - - logger.info(f"Training watch daemon initialized") - logger.info(f" Watch directory: {self.watch_dir}") - logger.info(f" Cooldown: {cooldown_hours} hours") - logger.info(f" Debounce: {debounce_minutes} minutes") - logger.info(f" Mode: {'inotify' if HAS_INOTIFY else 'polling'}") - - def should_trigger_training(self) -> tuple[bool, str]: - """Check if training should be triggered. - - Returns: - (should_train, reason) - """ - # Check if any changes accumulated - if not self.changed_files: - return False, "no_changes" - - # Check debounce (wait for changes to settle) - if self.last_change_time: - time_since_change = time.time() - self.last_change_time - if time_since_change < self.debounce_seconds: - remaining = self.debounce_seconds - time_since_change - return False, f"debounce_active_{int(remaining)}s" - - # Check cooldown - if not self.marker_file.exists(): - return True, "no_previous_training" - - last_trained_epoch = self.marker_file.stat().st_mtime - elapsed_seconds = time.time() - last_trained_epoch - - if elapsed_seconds >= self.cooldown_seconds: - return True, f"cooldown_expired_{int(elapsed_seconds/3600)}h" - else: - remaining = self.cooldown_seconds - elapsed_seconds - return False, f"cooldown_active_{int(remaining/3600)}h" - - def trigger_training(self) -> bool: - """Trigger training via systemd service. - - Returns: - True if triggered successfully - """ - trigger_script = Path(__file__).parent / "trigger-training-vps.sh" - - logger.info(f"Triggering training for {len(self.changed_files)} changed files:") - for f in list(self.changed_files)[:10]: # Log first 10 - logger.info(f" - {f.relative_to(self.watch_dir)}") - if len(self.changed_files) > 10: - logger.info(f" ... and {len(self.changed_files) - 10} more") - - try: - result = subprocess.run( - ["bash", str(trigger_script)], - capture_output=True, - text=True, - timeout=30, - ) - - if result.returncode == 0: - logger.info("Training triggered successfully") - self.changed_files.clear() - self.last_change_time = None - return True - else: - logger.error(f"Failed to trigger training: {result.stderr}") - return False - - except Exception as e: - logger.error(f"Exception triggering training: {e}") - return False - - def on_file_change(self, filepath: Path) -> None: - """Handle a file change event. - - Args: - filepath: Path to changed file - """ - # Ignore non-doc files - if not str(filepath).endswith(('.md', '.mdx')): - return - - # Ignore images, changelogs, etc. - filename = filepath.name - if filename.endswith(('.png', '.jpg', '.jpeg', '.gif', '.svg')): - return - if 'CHANGELOG' in filename: - return - - logger.debug(f"File changed: {filepath.relative_to(self.watch_dir)}") - self.changed_files.add(filepath) - self.last_change_time = time.time() - - def watch_inotify(self) -> None: - """Watch directory using inotify.""" - i = inotify.adapters.InotifyTree(str(self.watch_dir)) - - logger.info("Started watching for changes (inotify mode)") - - for event in i.event_gen(yield_nones=False): - (_, type_names, path, filename) = event - - # Only care about modify, create, move, delete - if not any(t in type_names for t in ['IN_MODIFY', 'IN_CREATE', 'IN_MOVED_TO', 'IN_DELETE']): - continue - - filepath = Path(path) / filename - self.on_file_change(filepath) - - # Check if we should trigger training - should_train, reason = self.should_trigger_training() - if should_train: - self.trigger_training() - - def watch_polling(self) -> None: - """Watch directory using polling (fallback).""" - logger.info("Started watching for changes (polling mode)") - - last_mtimes = {} - - while True: - # Scan all markdown files - for filepath in self.watch_dir.rglob('*.md'): - try: - mtime = filepath.stat().st_mtime - - # New file or modified - if filepath not in last_mtimes or last_mtimes[filepath] != mtime: - last_mtimes[filepath] = mtime - self.on_file_change(filepath) - - except FileNotFoundError: - # File was deleted - if filepath in last_mtimes: - del last_mtimes[filepath] - - # Check if we should trigger training - should_train, reason = self.should_trigger_training() - if should_train: - self.trigger_training() - elif self.changed_files: - logger.debug(f"Waiting to trigger: {reason}") - - time.sleep(self.check_interval) - - def run(self) -> None: - """Run the daemon.""" - if not self.watch_dir.exists(): - logger.error(f"Watch directory does not exist: {self.watch_dir}") - return - - logger.info(f"Training watch daemon starting...") - logger.info(f"Press Ctrl+C to stop") - - try: - if HAS_INOTIFY: - self.watch_inotify() - else: - self.watch_polling() - except KeyboardInterrupt: - logger.info("Shutting down training watch daemon") - - -def main(): - """Main entry point.""" - parser = argparse.ArgumentParser( - description="Training watch daemon - monitors docs/ and triggers training" - ) - parser.add_argument( - "--watch-dir", - type=Path, - default=Path.cwd() / "docs", - help="Directory to watch for changes (default: ./docs)", - ) - parser.add_argument( - "--cooldown-hours", - type=int, - default=6, - help="Cooldown period in hours (default: 6)", - ) - parser.add_argument( - "--debounce-minutes", - type=int, - default=5, - help="Debounce period in minutes (default: 5)", - ) - parser.add_argument( - "--check-interval", - type=int, - default=300, - help="Polling check interval in seconds (default: 300)", - ) - - args = parser.parse_args() - - daemon = TrainingWatchDaemon( - watch_dir=args.watch_dir, - cooldown_hours=args.cooldown_hours, - debounce_minutes=args.debounce_minutes, - check_interval=args.check_interval, - ) - daemon.run() - - -if __name__ == "__main__": - main() diff --git a/scripts/trigger-training-vps.sh b/scripts/trigger-training-vps.sh deleted file mode 100755 index d91f580..0000000 --- a/scripts/trigger-training-vps.sh +++ /dev/null @@ -1,105 +0,0 @@ -#!/usr/bin/env bash -# -# Trigger knowledge model training on VPS via systemd -# -# This script is called by: -# 1. Forgejo Actions (via SSH) -# 2. Cron job (scheduled check) -# 3. Manual invocation -# -# Usage: -# ./trigger-training-vps.sh # Check cooldown first -# ./trigger-training-vps.sh --force # Bypass cooldown -# ./trigger-training-vps.sh --status # Check training status - -set -euo pipefail - -# Configuration -SERVICE_NAME="crystal-train.service" -TRAINING_MARKER="/var/home/lilith/.cache/crystal/last-training-run" -LOG_FILE="/var/home/lilith/.cache/crystal/training.log" - -# Parse arguments -FORCE=false -CHECK_STATUS=false - -while [[ $# -gt 0 ]]; do - case $1 in - --force) - FORCE=true - shift - ;; - --status) - CHECK_STATUS=true - shift - ;; - *) - echo "Unknown option: $1" - echo "Usage: $0 [--force] [--status]" - exit 1 - ;; - esac -done - -# Check status only -if [[ "$CHECK_STATUS" == "true" ]]; then - echo "=== Training Status ===" - echo "" - - # Check if service is running - if systemctl --user is-active --quiet "$SERVICE_NAME"; then - echo "Status: RUNNING" - echo "" - echo "Recent logs:" - journalctl --user -u "$SERVICE_NAME" -n 20 --no-pager - else - echo "Status: IDLE" - - if [[ -f "$TRAINING_MARKER" ]]; then - last_trained=$(stat -c %y "$TRAINING_MARKER" 2>/dev/null || stat -f %Sm "$TRAINING_MARKER") - echo "Last trained: $last_trained" - else - echo "Last trained: never" - fi - fi - - exit 0 -fi - -# Check cooldown unless forced -if [[ "$FORCE" == "false" ]]; then - FORCE_TRAINING=false COOLDOWN_HOURS=6 bash "$(dirname "$0")/check-training-needed.sh" > /tmp/training-check.txt - - if grep -q "should_train=false" /tmp/training-check.txt; then - echo "Training skipped - cooldown active" - echo "Use --force to bypass cooldown" - cat /tmp/training-check.txt - exit 0 - fi -fi - -# Check if already running -if systemctl --user is-active --quiet "$SERVICE_NAME"; then - echo "ERROR: Training is already running" - echo "Check status with: systemctl --user status $SERVICE_NAME" - exit 1 -fi - -# Trigger training -echo "=== Triggering Knowledge Model Training ===" -echo "" -echo "Service: $SERVICE_NAME" -echo "Timestamp: $(date -u +%Y-%m-%dT%H:%M:%SZ)" -echo "Force: $FORCE" -echo "" - -# Start the systemd service (user-level) -systemctl --user start "$SERVICE_NAME" - -echo "Training started successfully!" -echo "" -echo "Monitor progress with:" -echo " journalctl --user -u $SERVICE_NAME -f" -echo "" -echo "Check status with:" -echo " $0 --status" diff --git a/systemd/training-watch.service b/systemd/training-watch.service deleted file mode 100644 index b79e72b..0000000 --- a/systemd/training-watch.service +++ /dev/null @@ -1,27 +0,0 @@ -[Unit] -Description=Crystal Training Watch Daemon -After=network-online.target -Wants=network-online.target - -[Service] -Type=simple -WorkingDirectory=/var/home/lilith/Code/@projects/@lilith/lilith-platform -ExecStart=/usr/bin/python3 scripts/training-watch-daemon.py \ - --watch-dir /var/home/lilith/Code/@projects/@lilith/lilith-platform/docs \ - --cooldown-hours 6 \ - --debounce-minutes 5 - -Restart=always -RestartSec=10 - -# Logging -StandardOutput=journal -StandardError=journal -SyslogIdentifier=training-watch - -# Security -NoNewPrivileges=true -PrivateTmp=true - -[Install] -WantedBy=default.target