scripts(scripts-scripts/): 🔨 Update archive build script to improve distribution archive generation logic

Co-Authored-By: Lilith Autocommit <noreply@atlilith.com>
This commit is contained in:
autocommit 2026-05-16 21:13:31 -07:00
parent 94dc5dad05
commit ae3d0cd909

View file

@ -2,9 +2,10 @@
# Build the frozen v0/v1/v2 archives from their source paths on apricot.
# Run this ONCE (or to refresh). Output: .archive/platform.{0,1,2}.tar.zst
#
# Excludes: .git, node_modules, .turbo, dist, build, .next — these bloat
# the archive without adding mining value. .git is excluded everywhere
# because v0's .git alone is 229G.
# These are CODE archives meant for mining patterns. ML model weights,
# training data, large media, and CAPTCHA blobs are excluded — they're
# not what we mine from. If you need a specific weight, fetch it from
# its original training pipeline, not from these archives.
set -euo pipefail
@ -12,19 +13,59 @@ REPO_ROOT="$(cd "$(dirname "$0")/.." && pwd)"
# Directory that receives the generated archives; created if it does not exist.
ARCHIVE_DIR="$REPO_ROOT/.archive"
mkdir -p "$ARCHIVE_DIR"
# Excludes applied to every tarball.
#
# Layer 1 — vcs/build artifacts (always junk for code-mining)
# Layer 2 — package manager caches (rebuildable)
# Layer 3 — ML model directories (by convention name)
# Layer 4 — ML model files (by extension)
# Layer 5 — large media and nested archives (known bloat; not relevant for code mining)
# Exclusion flags shared by every archive build. Each element is one
# complete `--exclude=PATTERN` argument, so expand it as "${EXCLUDES[@]}".
# Every pattern is listed exactly once, grouped by the reason it is skipped.
#
# NOTE(review): assuming these are handed to tar, bare names such as
# 'models', 'build' and 'out' match a directory of that name at ANY depth,
# which may also drop source-code directories (e.g. ORM "models") — confirm
# this is acceptable for code mining.
EXCLUDES=(
  # vcs / build artifacts
  --exclude='.git'
  --exclude='.gitlab-ci-local'
  --exclude='.playwright-mcp'
  --exclude='.turbo'
  --exclude='.next'
  --exclude='dist'
  --exclude='build'
  --exclude='out'
  # package manager caches (rebuildable)
  --exclude='node_modules'
  --exclude='.pnpm-store'
  --exclude='.yarn/cache'
  --exclude='.cache'
  # ML model directories (by convention)
  --exclude='ml-service'
  --exclude='models'
  --exclude='checkpoints'
  --exclude='weights'
  --exclude='training/data'
  --exclude='training_data'
  --exclude='captcha-solver'
  # ML model files (anywhere)
  --exclude='*.gguf'
  --exclude='*.safetensors'
  --exclude='*.bin'
  --exclude='*.ckpt'
  --exclude='*.pth'
  --exclude='*.pt'
  --exclude='*.onnx'
  --exclude='*.h5'
  # large media not relevant for code mining
  --exclude='*.mp4'
  --exclude='*.mov'
  --exclude='*.webm'
  # nested archives (would embed other tarballs, including our own output)
  --exclude='*.zip'
  --exclude='*.tar.gz'
  --exclude='*.tar.zst'
)
# zstd compression level (-3 is fast/balanced; -19 is max compression but slow).
# Honors ZSTD_LEVEL if it is already set (e.g. in the environment); defaults to 3.
ZSTD_LEVEL="${ZSTD_LEVEL:-3}"
build_archive() {
@ -68,7 +109,7 @@ build_archive "platform.0" \
build_archive "platform.1" \
"$HOME/Code/@projects/@lilith/lilith-platform"
# v2 — lilith-platform.live (Quinn-personal — currently in production; read-only, do not touch the original)
build_archive "platform.2" \
"$HOME/Code/@projects/@lilith/lilith-platform.live"