From ae3d0cd90957c39c63e75a79b34dad2c41dd12d0 Mon Sep 17 00:00:00 2001 From: autocommit Date: Sat, 16 May 2026 21:13:31 -0700 Subject: [PATCH] =?UTF-8?q?scripts(scripts-scripts/):=20=F0=9F=94=A8=20Upd?= =?UTF-8?q?ate=20archive=20build=20script=20to=20improve=20distribution=20?= =?UTF-8?q?archive=20generation=20logic?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Lilith Autocommit --- scripts/build-archives.sh | 63 ++++++++++++++++++++++++++++++++------- 1 file changed, 52 insertions(+), 11 deletions(-) diff --git a/scripts/build-archives.sh b/scripts/build-archives.sh index cc8197f..af1e9b2 100755 --- a/scripts/build-archives.sh +++ b/scripts/build-archives.sh @@ -2,9 +2,10 @@ # Build the frozen v0/v1/v2 archives from their source paths on apricot. # Run this ONCE (or to refresh). Output: .archive/platform.{0,1,2}.tar.zst # -# Excludes: .git, node_modules, .turbo, dist, build, .next — these bloat -# the archive without adding mining value. .git is excluded everywhere -# because v0's .git alone is 229G. +# These are CODE archives meant for mining patterns. ML model weights, +# training data, large media, and CAPTCHA blobs are excluded — they're +# not what we mine from. If you need a specific weight, fetch it from +# its original training pipeline, not from these archives. set -euo pipefail @@ -12,19 +13,59 @@ REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)" ARCHIVE_DIR="$REPO_ROOT/.archive" mkdir -p "$ARCHIVE_DIR" -# Excludes applied to every tarball +# Excludes applied to every tarball. 
+# +# Layer 1 — vcs/build artifacts (always junk for code-mining) +# Layer 2 — package manager caches (rebuildable) +# Layer 3 — ML model directories (by convention name) +# Layer 4 — ML model files (by extension) +# Layer 5 — large media and archive files (by extension) EXCLUDES=( + # vcs / build --exclude='.git' - --exclude='node_modules' - --exclude='.turbo' - --exclude='dist' - --exclude='build' - --exclude='.next' --exclude='.gitlab-ci-local' --exclude='.playwright-mcp' + --exclude='.turbo' + --exclude='.next' + --exclude='dist' + --exclude='build' + --exclude='out' + + # package manager + --exclude='node_modules' + --exclude='.pnpm-store' + --exclude='.yarn/cache' + --exclude='.cache' + + # ML model directories (by convention) + --exclude='ml-service' + --exclude='models' + --exclude='checkpoints' + --exclude='weights' + --exclude='training/data' + --exclude='training_data' + --exclude='captcha-solver' + + # ML model files (anywhere) + --exclude='*.gguf' + --exclude='*.safetensors' + --exclude='*.bin' + --exclude='*.ckpt' + --exclude='*.pth' + --exclude='*.pt' + --exclude='*.onnx' + --exclude='*.h5' + + # large media / archives not relevant for code mining + --exclude='*.mp4' + --exclude='*.mov' + --exclude='*.webm' + --exclude='*.zip' + --exclude='*.tar.gz' + --exclude='*.tar.zst' ) -# zstd level (-3 is fast/balanced; -19 max compression but slow) +# zstd level (-3 fast; -19 max). Default 3. ZSTD_LEVEL="${ZSTD_LEVEL:-3}" build_archive() { @@ -68,7 +109,7 @@ build_archive "platform.0" \ build_archive "platform.1" \ "$HOME/Code/@projects/@lilith/lilith-platform" -# v2 — lilith-platform.live (Quinn-personal, currently in prod — do not touch original) +# v2 — lilith-platform.live (Quinn-personal — production; read-only) build_archive "platform.2" \ "$HOME/Code/@projects/@lilith/lilith-platform.live"