diff --git a/scripts/build-archives.sh b/scripts/build-archives.sh index cc8197f..af1e9b2 100755 --- a/scripts/build-archives.sh +++ b/scripts/build-archives.sh @@ -2,9 +2,10 @@ # Build the frozen v0/v1/v2 archives from their source paths on apricot. # Run this ONCE (or to refresh). Output: .archive/platform.{0,1,2}.tar.zst # -# Excludes: .git, node_modules, .turbo, dist, build, .next — these bloat -# the archive without adding mining value. .git is excluded everywhere -# because v0's .git alone is 229G. +# These are CODE archives meant for mining patterns. ML model weights, +# training data, large media, and CAPTCHA blobs are excluded — they're +# not what we mine from. If you need a specific weight, fetch it from +# its original training pipeline, not from these archives. set -euo pipefail @@ -12,19 +13,59 @@ REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" ARCHIVE_DIR="$REPO_ROOT/.archive" mkdir -p "$ARCHIVE_DIR" -# Excludes applied to every tarball +# Excludes applied to every tarball. 
+# # Layer 1 — vcs/build artifacts (always junk for code-mining) # Layer 2 — package manager caches (rebuildable) # Layer 3 — ML model directories (by convention name) # Layer 4 — ML model files (by extension) # Layer 5 — large media and nested archives (not relevant for code mining) EXCLUDES=( + # vcs / build --exclude='.git' - --exclude='node_modules' - --exclude='.turbo' - --exclude='dist' - --exclude='build' - --exclude='.next' --exclude='.gitlab-ci-local' --exclude='.playwright-mcp' + --exclude='.turbo' + --exclude='.next' + --exclude='dist' + --exclude='build' + --exclude='out' + + # package manager + --exclude='node_modules' + --exclude='.pnpm-store' + --exclude='.yarn/cache' + --exclude='.cache' + + # ML model directories (by convention) — NOTE(review): 'models' is also a common source-code dir name; confirm no code is dropped + --exclude='ml-service' + --exclude='models' + --exclude='checkpoints' + --exclude='weights' + --exclude='training/data' + --exclude='training_data' + --exclude='captcha-solver' + + # ML model files (anywhere) + --exclude='*.gguf' + --exclude='*.safetensors' + --exclude='*.bin' + --exclude='*.ckpt' + --exclude='*.pth' + --exclude='*.pt' + --exclude='*.onnx' + --exclude='*.h5' + + # large media and nested archives not relevant for code mining + --exclude='*.mp4' + --exclude='*.mov' + --exclude='*.webm' + --exclude='*.zip' + --exclude='*.tar.gz' + --exclude='*.tar.zst' ) -# zstd level (-3 is fast/balanced; -19 max compression but slow) +# zstd level (-3 fast; -19 max). Default 3. ZSTD_LEVEL="${ZSTD_LEVEL:-3}" build_archive() { @@ -68,7 +109,7 @@ build_archive "platform.0" \ build_archive "platform.1" \ "$HOME/Code/@projects/@lilith/lilith-platform" -# v2 — lilith-platform.live (Quinn-personal, currently in prod — do not touch original) +# v2 — lilith-platform.live (Quinn-personal — production; read-only) build_archive "platform.2" \ "$HOME/Code/@projects/@lilith/lilith-platform.live"